summaryrefslogtreecommitdiff
path: root/thirdparty/embree-aarch64/kernels/geometry
diff options
context:
space:
mode:
authorRĂ©mi Verschelde <remi@verschelde.fr>2021-04-27 19:07:12 +0200
committerGitHub <noreply@github.com>2021-04-27 19:07:12 +0200
commit95cfce661bbe9700209c09dfe297ab7ef5ebfe09 (patch)
tree9f3c8a98b619e2a0dd171d6cb6d8dc2e791baa14 /thirdparty/embree-aarch64/kernels/geometry
parentb999fbc4bd349cce153c2133dd0487694add1a05 (diff)
parent4d9d99bb827967e2bb931eeb8c3f0e079b39ae1a (diff)
Merge pull request #48050 from JFonS/occlusion_culling
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/geometry')
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/cone.h321
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h209
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h141
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curveNi.h222
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h569
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h278
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h516
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curveNv.h101
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h181
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h98
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h129
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h417
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h49
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h214
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h362
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h671
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h21
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h21
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h21
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h21
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h21
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h22
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/cylinder.h223
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h216
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h277
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/filter.h204
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h99
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/grid_soa.h275
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h207
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h445
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/instance.h78
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h84
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h1074
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h172
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/line_intersector.h141
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/linei.h709
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h124
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/object.h84
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/object_intersector.h127
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/plane.h57
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/pointi.h417
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/primitive.h49
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp379
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h76
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h566
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h529
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quadi.h483
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h350
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quadv.h165
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h181
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h710
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h136
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h183
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h156
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h38
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h237
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subgrid.h517
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h518
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h493
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h508
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h236
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/triangle.h162
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h96
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h403
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h247
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h418
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h132
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/trianglei.h442
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h336
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/trianglev.h157
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h206
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h201
-rw-r--r--thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h211
73 files changed, 19139 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/geometry/cone.h b/thirdparty/embree-aarch64/kernels/geometry/cone.h
new file mode 100644
index 0000000000..961ef86160
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/cone.h
@@ -0,0 +1,321 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct Cone
+ {
+ const Vec3fa p0; //!< start position of cone
+ const Vec3fa p1; //!< end position of cone
+ const float r0; //!< start radius of cone
+ const float r1; //!< end radius of cone
+
+ __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1)
+ : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+ __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir,
+ BBox1f& t_o,
+ float& u0_o, Vec3fa& Ng0_o,
+ float& u1_o, Vec3fa& Ng1_o) const
+ {
+ /* calculate quadratic equation to solve */
+ const Vec3fa v0 = p0-org;
+ const Vec3fa v1 = p1-org;
+
+ const float rl = rcp_length(v1-v0);
+ const Vec3fa P0 = v0, dP = (v1-v0)*rl;
+ const float dr = (r1-r0)*rl;
+ const Vec3fa O = -P0, dO = dir;
+
+ const float dOdO = dot(dO,dO);
+ const float OdO = dot(dO,O);
+ const float OO = dot(O,O);
+ const float dOz = dot(dP,dO);
+ const float Oz = dot(dP,O);
+
+ const float R = r0 + Oz*dr;
+ const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr));
+ const float B = 2.0f * (OdO - dOz*(Oz + R*dr));
+ const float C = OO - (sqr(Oz) + sqr(R));
+
+ /* we miss the cone if determinant is smaller than zero */
+ const float D = B*B - 4.0f*A*C;
+ if (D < 0.0f) return false;
+
+ /* special case for rays that are "parallel" to the cone */
+ const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+ if (unlikely(abs(A) < eps))
+ {
+ /* cylinder case */
+ if (abs(dr) < 16.0f*float(ulp)) {
+ if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; }
+ else { t_o = BBox1f(pos_inf,neg_inf); return false; }
+ }
+
+ /* cone case */
+ else
+ {
+ /* if we hit the negative cone there cannot be a hit */
+ const float t = -C/B;
+ const float z0 = Oz+t*dOz;
+ const float z0r = r0+z0*dr;
+ if (z0r < 0.0f) return false;
+
+ /* test if we start inside or outside the cone */
+ if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf);
+ else t_o = BBox1f(neg_inf,t);
+ }
+ }
+
+ /* standard case for "non-parallel" rays */
+ else
+ {
+ const float Q = sqrt(D);
+ const float rcp_2A = rcp(2.0f*A);
+ t_o.lower = (-B-Q)*rcp_2A;
+ t_o.upper = (-B+Q)*rcp_2A;
+
+ /* standard case where both hits are on same cone */
+ if (likely(A > 0.0f)) {
+ const float z0 = Oz+t_o.lower*dOz;
+ const float z0r = r0+z0*dr;
+ if (z0r < 0.0f) return false;
+ }
+
+ /* special case where the hits are on the positive and negative cone */
+ else
+ {
+ /* depending on the ray direction and the open direction
+ * of the cone we have a hit from inside or outside the
+ * cone */
+ if (dOz*dr > 0) t_o.upper = pos_inf;
+ else t_o.lower = neg_inf;
+ }
+ }
+
+ /* calculates u and Ng for near hit */
+ {
+ u0_o = (Oz+t_o.lower*dOz)*rl;
+ const Vec3fa Pr = t_o.lower*dir;
+ const Vec3fa Pl = v0 + u0_o*(v1-v0);
+ const Vec3fa R = normalize(Pr-Pl);
+ const Vec3fa U = (p1-p0)+(r1-r0)*R;
+ const Vec3fa V = cross(p1-p0,R);
+ Ng0_o = cross(V,U);
+ }
+
+ /* calculates u and Ng for far hit */
+ {
+ u1_o = (Oz+t_o.upper*dOz)*rl;
+ const Vec3fa Pr = t_o.upper*dir;
+ const Vec3fa Pl = v0 + u1_o*(v1-v0);
+ const Vec3fa R = normalize(Pr-Pl);
+ const Vec3fa U = (p1-p0)+(r1-r0)*R;
+ const Vec3fa V = cross(p1-p0,R);
+ Ng1_o = cross(V,U);
+ }
+ return true;
+ }
+
+ __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const
+ {
+ float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o;
+ return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+ }
+
+ static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1)
+ {
+ float eps = 0.001f;
+ BBox1f t; bool hit;
+ hit = cone.intersect(ray.org,ray.dir,t);
+
+ bool failed = hit != shouldhit;
+ if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps;
+ if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps;
+ if (!failed) return true;
+ embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl;
+ return false;
+ }
+
+ /* verify cone class */
+ static bool verify()
+ {
+ bool passed = true;
+ const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+ passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf);
+ passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+ passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f);
+ passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f);
+ passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf);
+ passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+ passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f);
+ passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+ passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6);
+ passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f);
+ const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f);
+ passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f);
+ passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f);
+ const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+ passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+ passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+ passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+ passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+ passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+ passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+ passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+ return passed;
+ }
+
+ /*! output operator */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) {
+ return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}";
+ }
+ };
+
+ template<int N>
+ struct ConeN
+ {
+ typedef Vec3<vfloat<N>> Vec3vfN;
+
+ const Vec3vfN p0; //!< start position of cone
+ const Vec3vfN p1; //!< end position of cone
+ const vfloat<N> r0; //!< start radius of cone
+ const vfloat<N> r1; //!< end radius of cone
+
+ __forceinline ConeN(const Vec3vfN& p0, const vfloat<N>& r0, const Vec3vfN& p1, const vfloat<N>& r1)
+ : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+ __forceinline Cone operator[] (const size_t i) const
+ {
+ assert(i<N);
+ return Cone(Vec3fa(p0.x[i],p0.y[i],p0.z[i]),r0[i],Vec3fa(p1.x[i],p1.y[i],p1.z[i]),r1[i]);
+ }
+
+ __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir,
+ BBox<vfloat<N>>& t_o,
+ vfloat<N>& u0_o, Vec3vfN& Ng0_o,
+ vfloat<N>& u1_o, Vec3vfN& Ng1_o) const
+ {
+ /* calculate quadratic equation to solve */
+ const Vec3vfN v0 = p0-Vec3vfN(org);
+ const Vec3vfN v1 = p1-Vec3vfN(org);
+
+ const vfloat<N> rl = rcp_length(v1-v0);
+ const Vec3vfN P0 = v0, dP = (v1-v0)*rl;
+ const vfloat<N> dr = (r1-r0)*rl;
+ const Vec3vfN O = -P0, dO = dir;
+
+ const vfloat<N> dOdO = dot(dO,dO);
+ const vfloat<N> OdO = dot(dO,O);
+ const vfloat<N> OO = dot(O,O);
+ const vfloat<N> dOz = dot(dP,dO);
+ const vfloat<N> Oz = dot(dP,O);
+
+ const vfloat<N> R = r0 + Oz*dr;
+ const vfloat<N> A = dOdO - sqr(dOz) * (vfloat<N>(1.0f)+sqr(dr));
+ const vfloat<N> B = 2.0f * (OdO - dOz*(Oz + R*dr));
+ const vfloat<N> C = OO - (sqr(Oz) + sqr(R));
+
+ /* we miss the cone if determinant is smaller than zero */
+ const vfloat<N> D = B*B - 4.0f*A*C;
+ vbool<N> valid = D >= 0.0f;
+ if (none(valid)) return valid;
+
+ /* special case for rays that are "parallel" to the cone */
+ const vfloat<N> eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+ const vbool<N> validt = valid & (abs(A) < eps);
+ const vbool<N> validf = valid & !(abs(A) < eps);
+ if (unlikely(any(validt)))
+ {
+ const vboolx validtt = validt & (abs(dr) < 16.0f*float(ulp));
+ const vboolx validtf = validt & (abs(dr) >= 16.0f*float(ulp));
+
+ /* cylinder case */
+ if (unlikely(any(validtt)))
+ {
+ t_o.lower = select(validtt, select(C <= 0.0f, vfloat<N>(neg_inf), vfloat<N>(pos_inf)), t_o.lower);
+ t_o.upper = select(validtt, select(C <= 0.0f, vfloat<N>(pos_inf), vfloat<N>(neg_inf)), t_o.upper);
+ valid &= !validtt | C <= 0.0f;
+ }
+
+ /* cone case */
+ if (any(validtf))
+ {
+ /* if we hit the negative cone there cannot be a hit */
+ const vfloat<N> t = -C/B;
+ const vfloat<N> z0 = Oz+t*dOz;
+ const vfloat<N> z0r = r0+z0*dr;
+ valid &= !validtf | z0r >= 0.0f;
+
+ /* test if we start inside or outside the cone */
+ t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat<N>(neg_inf)), t_o.lower);
+ t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t), t_o.upper);
+ }
+ }
+
+ /* standard case for "non-parallel" rays */
+ if (likely(any(validf)))
+ {
+ const vfloat<N> Q = sqrt(D);
+ const vfloat<N> rcp_2A = 0.5f*rcp(A);
+ t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower);
+ t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper);
+
+ /* standard case where both hits are on same cone */
+ const vbool<N> validft = validf & A>0.0f;
+ const vbool<N> validff = validf & !(A>0.0f);
+ if (any(validft)) {
+ const vfloat<N> z0 = Oz+t_o.lower*dOz;
+ const vfloat<N> z0r = r0+z0*dr;
+ valid &= !validft | z0r >= 0.0f;
+ }
+
+ /* special case where the hits are on the positive and negative cone */
+ if (any(validff)) {
+ /* depending on the ray direction and the open direction
+ * of the cone we have a hit from inside or outside the
+ * cone */
+ t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower);
+ t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper);
+ }
+ }
+
+ /* calculates u and Ng for near hit */
+ {
+ u0_o = (Oz+t_o.lower*dOz)*rl;
+ const Vec3vfN Pr = t_o.lower*Vec3vfN(dir);
+ const Vec3vfN Pl = v0 + u0_o*(v1-v0);
+ const Vec3vfN R = normalize(Pr-Pl);
+ const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+ const Vec3vfN V = cross(p1-p0,R);
+ Ng0_o = cross(V,U);
+ }
+
+ /* calculates u and Ng for far hit */
+ {
+ u1_o = (Oz+t_o.upper*dOz)*rl;
+ const Vec3vfN Pr = t_o.lower*Vec3vfN(dir);
+ const Vec3vfN Pl = v0 + u1_o*(v1-v0);
+ const Vec3vfN R = normalize(Pr-Pl);
+ const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+ const Vec3vfN V = cross(p1-p0,R);
+ Ng1_o = cross(V,U);
+ }
+ return valid;
+ }
+
+ __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+ {
+ vfloat<N> u0_o; Vec3vfN Ng0_o; vfloat<N> u1_o; Vec3vfN Ng1_o;
+ return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+ }
+ };
+ }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h
new file mode 100644
index 0000000000..0902baff7d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h
@@ -0,0 +1,209 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ namespace __coneline_internal
+ {
+ template<int M, typename Epilog, typename ray_tfar_func>
+ static __forceinline bool intersectCone(const vbool<M>& valid_i,
+ const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir,
+ const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+ const Vec4vf<M>& v0, const Vec4vf<M>& v1,
+ const vbool<M>& cL, const vbool<M>& cR,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+
+ /* move ray origin closer to make calculations numerically stable */
+ const vfloat<M> dOdO = sqr(ray_dir);
+ const vfloat<M> rcp_dOdO = rcp(dOdO);
+ const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+ const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;
+ const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;
+
+ const Vec3vf<M> dP = v1.xyz() - v0.xyz();
+ const Vec3vf<M> p0 = ray_org - v0.xyz();
+ const Vec3vf<M> p1 = ray_org - v1.xyz();
+
+ const vfloat<M> dPdP = sqr(dP);
+ const vfloat<M> dP0 = dot(p0,dP);
+ const vfloat<M> dP1 = dot(p1,dP);
+ const vfloat<M> dOdP = dot(ray_dir,dP);
+
+ // intersect cone body
+ const vfloat<M> dr = v0.w - v1.w;
+ const vfloat<M> hy = dPdP + sqr(dr);
+ const vfloat<M> dO0 = dot(ray_dir,p0);
+ const vfloat<M> OO = sqr(p0);
+ const vfloat<M> dPdP2 = sqr(dPdP);
+ const vfloat<M> dPdPr0 = dPdP*v0.w;
+
+ const vfloat<M> A = dPdP2 - sqr(dOdP)*hy;
+ const vfloat<M> B = dPdP2*dO0 - dP0*dOdP*hy + dPdPr0*(dr*dOdP);
+ const vfloat<M> C = dPdP2*OO - sqr(dP0)*hy + dPdPr0*(2.0f*dr*dP0 - dPdPr0);
+
+ const vfloat<M> D = B*B - A*C;
+ valid &= D >= 0.0f;
+ if (unlikely(none(valid))) {
+ return false;
+ }
+
+ /* standard case for "non-parallel" rays */
+ const vfloat<M> Q = sqrt(D);
+ const vfloat<M> rcp_A = rcp(A);
+ /* special case for rays that are "parallel" to the cone - assume miss */
+ const vbool<M> isParallel = abs(A) <= min_rcp_input;
+
+ vfloat<M> t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A);
+ vfloat<M> t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A);
+ const vfloat<M> y_lower = dP0 + t_cone_lower*dOdP;
+ const vfloat<M> y_upper = dP0 + t_cone_upper*dOdP;
+ t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf);
+ t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf);
+
+ const vbool<M> hitDisk0 = valid & cL;
+ const vbool<M> hitDisk1 = valid & cR;
+ const vfloat<M> rcp_dOdP = rcp(dOdP);
+ const vfloat<M> t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf);
+ const vfloat<M> t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf);
+ const vfloat<M> t_disk_lower = min(t_disk0, t_disk1);
+ const vfloat<M> t_disk_upper = max(t_disk0, t_disk1);
+
+ const vfloat<M> t_lower = min(t_cone_lower, t_disk_lower);
+ const vfloat<M> t_upper = max(t_cone_upper, select(t_lower==t_disk_lower,
+ select(t_disk_upper==vfloat<M>(pos_inf),neg_inf,t_disk_upper),
+ select(t_disk_lower==vfloat<M>(pos_inf),neg_inf,t_disk_lower)));
+
+ const vbool<M> valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat<M>(pos_inf);
+ const vbool<M> valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat<M>(neg_inf);
+
+ const vbool<M> valid_first = valid_lower | valid_upper;
+ if (unlikely(none(valid_first)))
+ return false;
+
+ const vfloat<M> t_first = select(valid_lower, t_lower, t_upper);
+ const vfloat<M> y_first = select(valid_lower, y_lower, y_upper);
+
+ const vfloat<M> rcp_dPdP = rcp(dPdP);
+ const Vec3vf<M> dP2drr0dP = dPdP*dr*v0.w*dP;
+ const Vec3vf<M> dPhy = dP*hy;
+ const vbool<M> cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper);
+ const vbool<M> disk0_hit_first = valid & (t_first == t_disk0);
+ const Vec3vf<M> Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP));
+ const vfloat<M> u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+ /* invoke intersection filter for first hit */
+ RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+ const bool is_hit_first = epilog(valid_first, hit);
+
+ /* check for possible second hits before potentially accepted hit */
+ const vfloat<M> t_second = t_upper;
+ const vfloat<M> y_second = y_upper;
+ const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar());
+ if (unlikely(none(valid_second)))
+ return is_hit_first;
+
+ /* invoke intersection filter for second hit */
+ const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+ const vbool<M> disk0_hit_second = t_second == t_disk0;
+ const Vec3vf<M> Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP));
+ const vfloat<M> u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+ hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+ const bool is_hit_second = epilog(valid_second, hit);
+
+ return is_hit_first | is_hit_second;
+ }
+ }
+
+ template<int M>
+ struct ConeLineIntersectorHitM
+ {
+ __forceinline ConeLineIntersectorHitM() {}
+
+ __forceinline ConeLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+ : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+ __forceinline void finalize() {}
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ public:
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct ConeCurveIntersector1
+ {
+ typedef CurvePrecalculations1 Precalculations;
+
+ struct ray_tfar {
+ Ray& ray;
+ __forceinline ray_tfar(Ray& ray) : ray(ray) {}
+ __forceinline vfloat<M> operator() () const { return ray.tfar; };
+ };
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ Ray& ray,
+ IntersectContext* context,
+ const LineSegments* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+ const vbool<M>& cL, const vbool<M>& cR,
+ const Epilog& epilog)
+ {
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+ const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+ const vfloat<M> ray_tnear(ray.tnear());
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+ return __coneline_internal::intersectCone(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,cL,cR,epilog);
+ }
+ };
+
+ template<int M, int K>
+ struct ConeCurveIntersectorK
+ {
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ struct ray_tfar {
+ RayK<K>& ray;
+ size_t k;
+ __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {}
+ __forceinline vfloat<M> operator() () const { return ray.tfar[k]; };
+ };
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const LineSegments* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+ const vbool<M>& cL, const vbool<M>& cR,
+ const Epilog& epilog)
+ {
+ const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+ const vfloat<M> ray_tnear = ray.tnear()[k];
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+ return __coneline_internal::intersectCone(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,cL,cR,epilog);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h
new file mode 100644
index 0000000000..d47218eb8b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "coneline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M, int Mx, bool filter>
+ struct ConeCurveMiIntersector1
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ return false;
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct ConeCurveMiMBIntersector1
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom,ray.time());
+ const vbool<Mx> valid = line.template valid<Mx>();
+ ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom,ray.time());
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ return false;
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct ConeCurveMiIntersectorK
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ ConeCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return ConeCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct ConeCurveMiMBIntersectorK
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom,ray.time()[k]);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ ConeCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1;
+ vbool<M> cL,cR;
+ line.gather(v0,v1,cL,cR,geom,ray.time()[k]);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return ConeCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h
new file mode 100644
index 0000000000..51384f1959
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ template<int M>
+ struct CurveNi
+ {
+ struct Type : public PrimitiveType {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* Returns maximum number of stored primitives */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+ static __forceinline size_t bytes(size_t N)
+ {
+ const size_t f = N/M, r = N%M;
+ static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue");
+ return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r);
+ }
+
+ public:
+
+ /*! Default constructor. */
+ __forceinline CurveNi () {}
+
+ /*! fill curve from curve list */
+ __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+ {
+ size_t end = min(begin+M,_end);
+ N = (uint8_t)(end-begin);
+ const unsigned int geomID0 = prims[begin].geomID();
+ this->geomID(N) = geomID0;
+ ty = (uint8_t) scene->get(geomID0)->getType();
+
+ /* encode all primitives */
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<N; i++)
+ {
+ const PrimRef& prim = prims[begin+i];
+ const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+ const unsigned int primID = prim.primID();
+ bounds.extend(scene->get(geomID)->vbounds(primID));
+ }
+
+ /* calculate offset and scale */
+ Vec3fa loffset = bounds.lower;
+ float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+ if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+ *this->offset(N) = loffset;
+ *this->scale(N) = lscale;
+
+ /* encode all primitives */
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRef& prim = prims[begin];
+ const unsigned int geomID = prim.geomID();
+ const unsigned int primID = prim.primID();
+ const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpace(primID);
+
+ const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+ const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID);
+
+ bounds_vx_x(N)[i] = (int8_t) space3.vx.x;
+ bounds_vx_y(N)[i] = (int8_t) space3.vx.y;
+ bounds_vx_z(N)[i] = (int8_t) space3.vx.z;
+ bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f);
+ bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f);
+ assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f);
+
+ bounds_vy_x(N)[i] = (int8_t) space3.vy.x;
+ bounds_vy_y(N)[i] = (int8_t) space3.vy.y;
+ bounds_vy_z(N)[i] = (int8_t) space3.vy.z;
+ bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f);
+ bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f);
+ assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f);
+
+ bounds_vz_x(N)[i] = (int8_t) space3.vz.x;
+ bounds_vz_y(N)[i] = (int8_t) space3.vz.y;
+ bounds_vz_z(N)[i] = (int8_t) space3.vz.z;
+ bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f);
+ bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f);
+ assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f);
+
+ this->primID(N)[i] = primID;
+ }
+ }
+
+ template<typename BVH, typename Allocator>
+ __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+ {
+ size_t start = set.begin();
+ size_t items = CurveNi::blocks(set.size());
+ size_t numbytes = CurveNi::bytes(set.size());
+ CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment);
+ for (size_t i=0; i<items; i++) {
+ accel[i].fill(prims,start,set.end(),bvh->scene);
+ }
+ return bvh->encodeLeaf((int8_t*)accel,items);
+ };
+
+ public:
+
+ // 27.6 - 46 bytes per primitive
+ uint8_t ty;
+ uint8_t N;
+ uint8_t data[4+25*M+16];
+
+ /*
+ struct Layout
+ {
+ unsigned int geomID;
+ unsigned int primID[N];
+
+ int8_t bounds_vx_x[N];
+ int8_t bounds_vx_y[N];
+ int8_t bounds_vx_z[N];
+ short bounds_vx_lower[N];
+ short bounds_vx_upper[N];
+
+ int8_t bounds_vy_x[N];
+ int8_t bounds_vy_y[N];
+ int8_t bounds_vy_z[N];
+ short bounds_vy_lower[N];
+ short bounds_vy_upper[N];
+
+ int8_t bounds_vz_x[N];
+ int8_t bounds_vz_y[N];
+ int8_t bounds_vz_z[N];
+ short bounds_vz_lower[N];
+ short bounds_vz_upper[N];
+
+ Vec3f offset;
+ float scale;
+ };
+ */
+
+ __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); }
+ __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
+
+ __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); }
+ __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
+
+ __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); }
+ __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
+
+ __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); }
+ __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
+
+ __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); }
+ __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
+
+ __forceinline short* bounds_vx_lower(size_t N) { return (short*)((int8_t*)this+6+7*N); }
+ __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
+
+ __forceinline short* bounds_vx_upper(size_t N) { return (short*)((int8_t*)this+6+9*N); }
+ __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
+
+ __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+11*N); }
+ __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); }
+
+ __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+12*N); }
+ __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); }
+
+ __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+13*N); }
+ __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); }
+
+ __forceinline short* bounds_vy_lower(size_t N) { return (short*)((int8_t*)this+6+14*N); }
+ __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); }
+
+ __forceinline short* bounds_vy_upper(size_t N) { return (short*)((int8_t*)this+6+16*N); }
+ __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); }
+
+ __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+18*N); }
+ __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); }
+
+ __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+19*N); }
+ __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); }
+
+ __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+20*N); }
+ __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); }
+
+ __forceinline short* bounds_vz_lower(size_t N) { return (short*)((int8_t*)this+6+21*N); }
+ __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); }
+
+ __forceinline short* bounds_vz_upper(size_t N) { return (short*)((int8_t*)this+6+23*N); }
+ __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); }
+
+ __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+25*N); }
+ __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); }
+
+ __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+25*N+12); }
+ __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); }
+
+ __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+25*N+16; }
+ __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; }
+ };
+
+ template<int M>
+ typename CurveNi<M>::Type CurveNi<M>::type;
+
+ typedef CurveNi<4> Curve4i;
+ typedef CurveNi<8> Curve8i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h
new file mode 100644
index 0000000000..0f9038c9fc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h
@@ -0,0 +1,569 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct CurveNiIntersector1
+ {
+ typedef CurveNi<M> Primitive;
+ typedef Vec3vf<M> Vec3vfM;
+ typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+ {
+ const size_t N = prim.N;
+ const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+ const Vec3fa offset = Vec3fa(offset_scale);
+ const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+ const Vec3fa org1 = (ray.org-offset)*scale;
+ const Vec3fa dir1 = ray.dir*scale;
+
+ const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+ vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+ vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+ const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+ const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+ const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+ const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+ const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+ const vfloat<M> round_up (1.0f+3.0f*float(ulp));
+ const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+ const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+ const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+ tNear_o = tNear;
+ return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+ unsigned int vertexID = geom->curve(primID);
+ Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+ unsigned int vertexID = geom->curve(primID);
+ Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+ Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+ if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+ Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+ if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+ };
+
+ template<int M, int K>
+ struct CurveNiIntersectorK
+ {
+ typedef CurveNi<M> Primitive;
+ typedef Vec3vf<M> Vec3vfM;
+ typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+ {
+ const size_t N = prim.N;
+ const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+ const Vec3fa offset = Vec3fa(offset_scale);
+ const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+
+ const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+ const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+ const Vec3fa org1 = (ray_org-offset)*scale;
+ const Vec3fa dir1 = ray_dir*scale;
+
+ const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+ vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+ vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+ const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+ const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+ const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+ const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+ const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+ const vfloat<M> round_up (1.0f+3.0f*float(ulp));
+ const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+ const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+ const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+ tNear_o = tNear;
+ return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+ unsigned int vertexID = geom->curve(primID);
+ Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+ unsigned int vertexID = geom->curve(primID);
+ Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ const unsigned int primID1 = prim.primID(N)[i1];
+ geom->prefetchL1_vertices(geom->curve(primID1));
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ const unsigned int primID2 = prim.primID(N)[i2];
+ geom->prefetchL2_vertices(geom->curve(primID2));
+ }
+ }
+
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+ Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+ Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h
new file mode 100644
index 0000000000..0cd8f833fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h
@@ -0,0 +1,278 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ template<int M>
+ struct CurveNiMB
+ {
+ struct Type : public PrimitiveType {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* Returns maximum number of stored primitives */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+ static __forceinline size_t bytes(size_t N)
+ {
+ const size_t f = N/M, r = N%M;
+ static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue");
+ return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24);
+ }
+
+ public:
+
+ /*! Default constructor. */
+ __forceinline CurveNiMB () {}
+
+ /*! fill curve from curve list */
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range)
+ {
+ size_t end = min(begin+M,_end);
+ N = (uint8_t)(end-begin);
+ const unsigned int geomID0 = prims[begin].geomID();
+ this->geomID(N) = geomID0;
+ ty = (uint8_t) scene->get(geomID0)->getType();
+
+ /* encode all primitives */
+ LBBox3fa lbounds = empty;
+ for (size_t i=0; i<N; i++)
+ {
+ const PrimRefMB& prim = prims[begin+i];
+ const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+ const unsigned int primID = prim.primID();
+ lbounds.extend(scene->get(geomID)->vlinearBounds(primID,time_range));
+ }
+ BBox3fa bounds = lbounds.bounds();
+
+ /* calculate offset and scale */
+ Vec3fa loffset = bounds.lower;
+ float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+ if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+ *this->offset(N) = loffset;
+ *this->scale(N) = lscale;
+ this->time_offset(N) = time_range.lower;
+ this->time_scale(N) = 1.0f/time_range.size();
+
+ /* encode all primitives */
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRefMB& prim = prims[begin];
+ const unsigned int geomID = prim.geomID();
+ const unsigned int primID = prim.primID();
+ const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpaceMB(primID,time_range);
+
+ const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+ const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range);
+
+ // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug
+ bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x;
+ bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y;
+ bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z;
+ bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f);
+ bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f);
+ bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f);
+ bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f);
+ assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f);
+ assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f);
+
+ bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x;
+ bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y;
+ bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z;
+ bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f);
+ bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f);
+ bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f);
+ bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f);
+ assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f);
+ assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f);
+
+ bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x;
+ bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y;
+ bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z;
+ bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f);
+ bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f);
+ bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f);
+ bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f);
+ assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f);
+ assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f);
+ assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f);
+
+ this->primID(N)[i] = primID;
+ }
+
+ return lbounds;
+ }
+
+ template<typename BVH, typename SetMB, typename Allocator>
+ __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+ {
+ size_t start = prims.begin();
+ size_t end = prims.end();
+ size_t items = CurveNiMB::blocks(prims.size());
+ size_t numbytes = CurveNiMB::bytes(prims.size());
+ CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment);
+ const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items);
+
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<items; i++)
+ bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+
+ return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+ };
+
+
+ public:
+
+ // 27.6 - 46 bytes per primitive
+ uint8_t ty;
+ uint8_t N;
+ uint8_t data[4+37*M+24];
+
+ /*
+ struct Layout
+ {
+ unsigned int geomID;
+ unsigned int primID[N];
+
+ int8_t bounds_vx_x[N];
+ int8_t bounds_vx_y[N];
+ int8_t bounds_vx_z[N];
+ short bounds_vx_lower0[N];
+ short bounds_vx_upper0[N];
+ short bounds_vx_lower1[N];
+ short bounds_vx_upper1[N];
+
+ int8_t bounds_vy_x[N];
+ int8_t bounds_vy_y[N];
+ int8_t bounds_vy_z[N];
+ short bounds_vy_lower0[N];
+ short bounds_vy_upper0[N];
+ short bounds_vy_lower1[N];
+ short bounds_vy_upper1[N];
+
+ int8_t bounds_vz_x[N];
+ int8_t bounds_vz_y[N];
+ int8_t bounds_vz_z[N];
+ short bounds_vz_lower0[N];
+ short bounds_vz_upper0[N];
+ short bounds_vz_lower1[N];
+ short bounds_vz_upper1[N];
+
+ Vec3f offset;
+ float scale;
+
+ float time_offset;
+ float time_scale;
+ };
+ */
+
+ __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); }
+ __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
+
+ __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); }
+ __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
+
+ __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); }
+ __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
+
+ __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); }
+ __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
+
+ __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); }
+ __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
+
+ __forceinline short* bounds_vx_lower0(size_t N) { return (short*)((int8_t*)this+6+7*N); }
+ __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
+
+ __forceinline short* bounds_vx_upper0(size_t N) { return (short*)((int8_t*)this+6+9*N); }
+ __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
+
+ __forceinline short* bounds_vx_lower1(size_t N) { return (short*)((int8_t*)this+6+11*N); }
+ __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); }
+
+ __forceinline short* bounds_vx_upper1(size_t N) { return (short*)((int8_t*)this+6+13*N); }
+ __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); }
+
+ __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+15*N); }
+ __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); }
+
+ __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+16*N); }
+ __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); }
+
+ __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+17*N); }
+ __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+17*N); }
+
+ __forceinline short* bounds_vy_lower0(size_t N) { return (short*)((int8_t*)this+6+18*N); }
+ __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); }
+
+ __forceinline short* bounds_vy_upper0(size_t N) { return (short*)((int8_t*)this+6+20*N); }
+ __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); }
+
+ __forceinline short* bounds_vy_lower1(size_t N) { return (short*)((int8_t*)this+6+22*N); }
+ __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); }
+
+ __forceinline short* bounds_vy_upper1(size_t N) { return (short*)((int8_t*)this+6+24*N); }
+ __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); }
+
+ __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+26*N); }
+ __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); }
+
+ __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+27*N); }
+ __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); }
+
+ __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+28*N); }
+ __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); }
+
+ __forceinline short* bounds_vz_lower0(size_t N) { return (short*)((int8_t*)this+6+29*N); }
+ __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); }
+
+ __forceinline short* bounds_vz_upper0(size_t N) { return (short*)((int8_t*)this+6+31*N); }
+ __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); }
+
+ __forceinline short* bounds_vz_lower1(size_t N) { return (short*)((int8_t*)this+6+33*N); }
+ __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((int8_t*)this+6+33*N); }
+
+ __forceinline short* bounds_vz_upper1(size_t N) { return (short*)((int8_t*)this+6+35*N); }
+ __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); }
+
+ __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+37*N); }
+ __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); }
+
+ __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+37*N+12); }
+ __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); }
+
+ __forceinline float& time_offset(size_t N) { return *(float*)((int8_t*)this+6+37*N+16); }
+ __forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); }
+
+ __forceinline float& time_scale(size_t N) { return *(float*)((int8_t*)this+6+37*N+20); }
+ __forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); }
+
+ __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+37*N+24; }
+ __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; }
+ };
+
+ template<int M>
+ typename CurveNiMB<M>::Type CurveNiMB<M>::type;
+
+ typedef CurveNiMB<4> Curve4iMB;
+ typedef CurveNiMB<8> Curve8iMB;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h
new file mode 100644
index 0000000000..0cbc764668
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h
@@ -0,0 +1,516 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi_mb.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct CurveNiMBIntersector1
+ {
+ typedef CurveNiMB<M> Primitive;
+ typedef Vec3vf<M> Vec3vfM;
+ typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+ {
+ const size_t N = prim.N;
+ const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+ const Vec3fa offset = Vec3fa(offset_scale);
+ const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+ const Vec3fa org1 = (ray.org-offset)*scale;
+ const Vec3fa dir1 = ray.dir*scale;
+
+ const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+ vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+ vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+ const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+ const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+ const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+ const vfloat<M> ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N);
+ const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+ const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+ const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+ const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+ const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+ const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+ const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+ const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+ const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+ const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+ const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+ const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+
+ const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+ const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+ const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+ const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+ const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+ const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+
+ const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+ const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+ const vfloat<M> round_up (1.0f+3.0f*float(ulp));
+ const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+ const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+ const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+ tNear_o = tNear;
+ return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+ Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+ if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+ Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+
+ if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+ Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+ if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+ Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+ if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+ };
+
+ template<int M, int K>
+ struct CurveNiMBIntersectorK
+ {
+ typedef CurveNiMB<M> Primitive;
+ typedef Vec3vf<M> Vec3vfM;
+ typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+ {
+ const size_t N = prim.N;
+ const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N));
+ const Vec3fa offset = Vec3fa(offset_scale);
+ const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale));
+
+ const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+ const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+ const Vec3fa org1 = (ray_org-offset)*scale;
+ const Vec3fa dir1 = ray_dir*scale;
+
+ const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+ vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+ vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+ const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+ const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+ const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+ const vfloat<M> ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N);
+ const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+ const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+ const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+ const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+ const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+ const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+ const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+ const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+ const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+ const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+ const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+ const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+
+ const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+ const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+ const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+ const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+ const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+ const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+
+ const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+ const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+ const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+ const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+ const vfloat<M> round_up (1.0f+3.0f*float(ulp));
+ const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+ const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+ const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+ tNear_o = tNear;
+ return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar);
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+
+ Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+ Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]);
+ Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]);
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+ Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+ const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]);
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h
new file mode 100644
index 0000000000..6eb5e30b39
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+ template<int M>
+ struct CurveNv : public CurveNi<M>
+ {
+ using CurveNi<M>::N;
+
+ struct Type : public PrimitiveType {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* Returns maximum number of stored primitives */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+
+ static __forceinline size_t bytes(size_t N)
+ {
+ const size_t f = N/M, r = N%M;
+ static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue");
+ return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r);
+ }
+
+ public:
+
+ /*! Default constructor. */
+ __forceinline CurveNv () {}
+
+ /*! fill curve from curve list */
+ __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+ {
+ size_t end = min(begin+M,_end);
+ size_t N = end-begin;
+
+ /* encode all primitives */
+ for (size_t i=0; i<N; i++)
+ {
+ const PrimRef& prim = prims[begin+i];
+ const unsigned int geomID = prim.geomID();
+ const unsigned int primID = prim.primID();
+ CurveGeometry* mesh = (CurveGeometry*) scene->get(geomID);
+ const unsigned vtxID = mesh->curve(primID);
+ Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0));
+ Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1));
+ Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2));
+ Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3));
+ }
+ }
+
+ template<typename BVH, typename Allocator>
+ __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+ {
+ if (set.size() == 0)
+ return BVH::emptyNode;
+
+ /* fall back to CurveNi for oriented curves */
+ unsigned int geomID = prims[set.begin()].geomID();
+ if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) {
+ return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+ }
+ if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) {
+ return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+ }
+
+ size_t start = set.begin();
+ size_t items = CurveNv::blocks(set.size());
+ size_t numbytes = CurveNv::bytes(set.size());
+ CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment);
+ for (size_t i=0; i<items; i++) {
+ accel[i].CurveNv<M>::fill(prims,start,set.end(),bvh->scene);
+ accel[i].CurveNi<M>::fill(prims,start,set.end(),bvh->scene);
+ }
+ return bvh->encodeLeaf((char*)accel,items);
+ };
+
+ public:
+ unsigned char data[4*16*M];
+ __forceinline Vec3fa* vertices(size_t i, size_t N) { return (Vec3fa*)CurveNi<M>::end(N)+4*i; }
+ __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi<M>::end(N)+4*i; }
+ };
+
+ template<int M>
+ typename CurveNv<M>::Type CurveNv<M>::type;
+
+ typedef CurveNv<4> Curve4v;
+ typedef CurveNv<8> Curve8v;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h
new file mode 100644
index 0000000000..e20da2882e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNv.h"
+#include "curveNi_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct CurveNvIntersector1 : public CurveNiIntersector1<M>
+ {
+ typedef CurveNv<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+ const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+ const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+ const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+ const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ prefetchL1(&prim.vertices(i1,N)[0]);
+ prefetchL1(&prim.vertices(i1,N)[4]);
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ prefetchL2(&prim.vertices(i2,N)[0]);
+ prefetchL2(&prim.vertices(i2,N)[4]);
+ }
+ }
+
+ Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+ const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+ const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+ const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+ const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ prefetchL1(&prim.vertices(i1,N)[0]);
+ prefetchL1(&prim.vertices(i1,N)[4]);
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ prefetchL2(&prim.vertices(i2,N)[0]);
+ prefetchL2(&prim.vertices(i2,N)[4]);
+ }
+ }
+
+ if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+ }
+ return false;
+ }
+ };
+
+ template<int M, int K>
+ struct CurveNvIntersectorK : public CurveNiIntersectorK<M,K>
+ {
+ typedef CurveNv<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(normal.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+ const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+ const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+ const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+ const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ prefetchL1(&prim.vertices(i1,N)[0]);
+ prefetchL1(&prim.vertices(i1,N)[4]);
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ prefetchL2(&prim.vertices(i2,N)[0]);
+ prefetchL2(&prim.vertices(i2,N)[4]);
+ }
+ }
+
+ Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ }
+
+ template<typename Intersector, typename Epilog>
+ static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+ {
+ vfloat<M> tNear;
+ vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+
+ const size_t N = prim.N;
+ size_t mask = movemask(valid);
+ while (mask)
+ {
+ const size_t i = bscf(mask);
+ STAT3(shadow.trav_prims,1,1,1);
+ const unsigned int geomID = prim.geomID(N);
+ const unsigned int primID = prim.primID(N)[i];
+ const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+ const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+ const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+ const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+ const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+
+ size_t mask1 = mask;
+ const size_t i1 = bscf(mask1);
+ if (mask) {
+ prefetchL1(&prim.vertices(i1,N)[0]);
+ prefetchL1(&prim.vertices(i1,N)[4]);
+ if (mask1) {
+ const size_t i2 = bsf(mask1);
+ prefetchL2(&prim.vertices(i2,N)[0]);
+ prefetchL2(&prim.vertices(i2,N)[4]);
+ }
+ }
+
+ if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+ return true;
+
+ mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+ }
+ return false;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h
new file mode 100644
index 0000000000..204958f7cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "bezier_hair_intersector.h"
+#include "bezier_ribbon_intersector.h"
+#include "bezier_curve_intersector.h"
+#include "oriented_curve_intersector.h"
+#include "../bvh/node_intersector1.h"
+
+// FIXME: this file seems replicate of curve_intersector_virtual.h
+
+namespace embree
+{
+ namespace isa
+ {
+ struct VirtualCurveIntersector1
+ {
+ typedef unsigned char Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+ leafIntersector.intersect<1>(&pre,&ray,context,prim);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+ return leafIntersector.occluded<1>(&pre,&ray,context,prim);
+ }
+ };
+
+ template<int K>
+ struct VirtualCurveIntersectorK
+ {
+ typedef unsigned char Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+ size_t mask = movemask(valid_i);
+ while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim);
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+ vbool<K> valid_o = false;
+ size_t mask = movemask(valid_i);
+ while (mask) {
+ size_t k = bscf(mask);
+ if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim))
+ set(valid_o, k);
+ }
+ return valid_o;
+ }
+
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+ leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
+ }
+
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty];
+ return leafIntersector.occluded<K>(&pre,&ray,k,context,prim);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h
new file mode 100644
index 0000000000..343cc8ff28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename NativeCurve3fa, int M>
+ struct DistanceCurveHit
+ {
+ __forceinline DistanceCurveHit() {}
+
+ __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+ const NativeCurve3fa& curve3D)
+ : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+
+ __forceinline void finalize()
+ {
+ vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N));
+ vv = V;
+ vt = T;
+ }
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const {
+ return curve3D.eval_du(vu[i]);
+ }
+
+ public:
+ vfloat<M> U;
+ vfloat<M> V;
+ vfloat<M> T;
+ int i, N;
+ NativeCurve3fa curve3D;
+
+ public:
+ vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ };
+
+ template<typename NativeCurve3fa>
+ struct DistanceCurve1Intersector1
+ {
+ template<typename Epilog>
+ __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3,
+ const Epilog& epilog)
+ {
+ const int N = geom->tessellationRate;
+
+ /* transform control points into ray space */
+ const NativeCurve3fa curve3Di(v0,v1,v2,v3);
+ const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di);
+ const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org);
+
+ /* evaluate the bezier curve */
+ vboolx valid = vfloatx(step) < vfloatx(float(N));
+ const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+ const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+
+ /* approximative intersection with cone */
+ const Vec4vfx v = p1-p0;
+ const Vec4vfx w = -p0;
+ const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
+ const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
+ const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
+ const Vec4vfx p = madd(u,v,p0);
+ const vfloatx t = p.z*pre.depth_scale;
+ const vfloatx d2 = madd(p.x,p.x,p.y*p.y);
+ const vfloatx r = p.w;
+ const vfloatx r2 = r*r;
+ valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+ valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+ /* update hit information */
+ bool ishit = false;
+ if (unlikely(any(valid))) {
+ DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D);
+ ishit = ishit | epilog(valid,hit);
+ }
+
+ if (unlikely(VSIZEX < N))
+ {
+ /* process SIMD-size many segments per iteration */
+ for (int i=VSIZEX; i<N; i+=VSIZEX)
+ {
+ /* evaluate the bezier curve */
+ vboolx valid = vintx(i)+vintx(step) < vintx(N);
+ const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+ const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+
+ /* approximative intersection with cone */
+ const Vec4vfx v = p1-p0;
+ const Vec4vfx w = -p0;
+ const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
+ const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
+ const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
+ const Vec4vfx p = madd(u,v,p0);
+ const vfloatx t = p.z*pre.depth_scale;
+ const vfloatx d2 = madd(p.x,p.x,p.y*p.y);
+ const vfloatx r = p.w;
+ const vfloatx r2 = r*r;
+ valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+ valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+ /* update hit information */
+ if (unlikely(any(valid))) {
+ DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D);
+ ishit = ishit | epilog(valid,hit);
+ }
+ }
+ }
+ return ishit;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h
new file mode 100644
index 0000000000..47531027fc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+#include "curve_intersector_sweep.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+#define DBG(x)
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename Ray, typename Epilog>
+ struct TensorLinearCubicBezierSurfaceIntersector
+ {
+ const LinearSpace3fa& ray_space;
+ Ray& ray;
+ TensorLinearCubicBezierSurface3fa curve3d;
+ TensorLinearCubicBezierSurface2fa curve2d;
+ float eps;
+ const Epilog& epilog;
+ bool isHit;
+
+ __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog)
+ : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false)
+ {
+ const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org);
+ curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R));
+ const BBox2fa b2 = curve2d.bounds();
+ eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper)));
+ }
+
+ __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1)
+ {
+ if (p1 == p0) {
+ if (p0 == 0.0f) return Interval1f(u0,u1);
+ else return Interval1f(empty);
+ }
+ const float t = -p0/(p1-p0);
+ const float tt = lerp(u0,u1,t);
+ return Interval1f(tt);
+ }
+
+ __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u)
+ {
+ if (sign(p0.lower) != sign(p0.upper)) u.extend(u0);
+ if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower));
+ if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper));
+ if (sign(p1.lower) != sign(p1.upper)) u.extend(u1);
+ }
+
+ __forceinline Interval1f bezier_clipping(const CubicBezierCurve<Interval1f>& curve)
+ {
+ Interval1f u = empty;
+ solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u);
+ solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u);
+ solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u);
+ solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u);
+ solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u);
+ solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u);
+ return intersect(u,Interval1f(0.0f,1.0f));
+ }
+
+ __forceinline Interval1f bezier_clipping(const LinearBezierCurve<Interval1f>& curve)
+ {
+ Interval1f v = empty;
+ solve_linear(0.0f,1.0f,curve.v0,curve.v1,v);
+ return intersect(v,Interval1f(0.0f,1.0f));
+ }
+
+ __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2)
+ {
+ BBox2fa bounds = curve2.bounds();
+ if (bounds.upper.x < 0.0f) return;
+ if (bounds.upper.y < 0.0f) return;
+ if (bounds.lower.x > 0.0f) return;
+ if (bounds.lower.y > 0.0f) return;
+
+ if (max(cu.size(),cv.size()) < 1E-4f)
+ {
+ const float u = cu.center();
+ const float v = cv.center();
+ TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org);
+ const float t = curve_z.eval(u,v);
+ if (ray.tnear() <= t && t <= ray.tfar) {
+ const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v));
+ BezierCurveHit hit(t,u,v,Ng);
+ isHit |= epilog(hit);
+ }
+ return;
+ }
+
+ const Vec2fa dv = curve2.axis_v();
+ const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+ LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+ if (!curve0v.hasRoot()) return;
+
+ const Interval1f v = bezier_clipping(curve0v);
+ if (isEmpty(v)) return;
+ TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v);
+ cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper));
+
+ const Vec2fa du = curve2.axis_u();
+ const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du);
+ CubicBezierCurve<Interval1f> curve0u = curve1u.reduce_v();
+ int roots = curve0u.maxRoots();
+ if (roots == 0) return;
+
+ if (roots == 1)
+ {
+ const Interval1f u = bezier_clipping(curve0u);
+ if (isEmpty(u)) return;
+ TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u);
+ cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper));
+ solve_bezier_clipping(cu,cv,curve2b);
+ return;
+ }
+
+ TensorLinearCubicBezierSurface2fa curve2l, curve2r;
+ curve2a.split_u(curve2l,curve2r);
+ solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l);
+ solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r);
+ }
+
+ __forceinline bool solve_bezier_clipping()
+ {
+ solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d);
+ return isHit;
+ }
+
+ __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv)
+ {
+ Vec2fa uv(cu.center(),cv.center());
+ const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y);
+ const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y);
+ const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv));
+ solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J);
+ }
+
+ __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J)
+ {
+ Vec2fa uv = uv_in;
+
+ for (size_t i=0; i<200; i++)
+ {
+ const Vec2fa f = curve2d.eval(uv.x,uv.y);
+ const Vec2fa duv = rcp_J*f;
+ uv -= duv;
+
+ if (max(abs(f.x),abs(f.y)) < eps)
+ {
+ const float u = uv.x;
+ const float v = uv.y;
+ if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs
+ if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs
+ const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org);
+ const float t = curve_z.eval(u,v);
+ if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs
+ const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v));
+ BezierCurveHit hit(t,u,v,Ng);
+ isHit |= epilog(hit);
+ return;
+ }
+ }
+ }
+
+ __forceinline bool clip_v(BBox1f& cu, BBox1f& cv)
+ {
+ const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower);
+ const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv);
+ LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+ if (!curve0v.hasRoot()) return false;
+ Interval1f v = bezier_clipping(curve0v);
+ if (isEmpty(v)) return false;
+ v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f));
+ cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper));
+ return true;
+ }
+
+ __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv)
+ {
+ /* perform bezier clipping in v-direction to get tight v-bounds */
+ TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv);
+ const Vec2fa dv = curve2.axis_v();
+ const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+ LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+ if (unlikely(!curve0v.hasRoot())) return true;
+ Interval1f v = bezier_clipping(curve0v);
+ if (unlikely(isEmpty(v))) return true;
+ v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f));
+ curve2 = curve2.clip_v(v);
+ cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper));
+
+ /* perform one newton raphson iteration */
+ Vec2fa c(cu.center(),cv.center());
+ Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv);
+ const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv));
+ const Vec2fa c1 = c - rcp_J*f;
+
+ /* calculate bounds of derivatives */
+ const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds();
+ const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds();
+
+ /* calculate krawczyk test */
+ LinearSpace2<Vec2<Interval1f>> I(Interval1f(1.0f), Interval1f(0.0f),
+ Interval1f(0.0f), Interval1f(1.0f));
+
+ LinearSpace2<Vec2<Interval1f>> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x),
+ Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y));
+
+ const LinearSpace2<Vec2f> rcp_J2(rcp_J);
+ const LinearSpace2<Vec2<Interval1f>> rcp_Ji(rcp_J2);
+
+ const Vec2<Interval1f> x(cu,cv);
+ const Vec2<Interval1f> K = Vec2<Interval1f>(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2<Interval1f>(Vec2f(c)));
+
+ /* test if there is no solution */
+ const Vec2<Interval1f> KK = intersect(K,x);
+ if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true;
+
+ /* exit if convergence cannot get proven, but terminate if we are very small */
+ if (unlikely(!subset(K,x) && !very_small)) return false;
+
+ /* solve using newton raphson iteration of convergence is guarenteed */
+ solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J);
+ return true;
+ }
+
+ __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv)
+ {
+ if (!clip_v(cu,cv)) return;
+ return solve_newton_raphson(cu,cv);
+ }
+
+ __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv)
+ {
+ unsigned int sptr = 0;
+ const unsigned int stack_size = 4;
+ unsigned int mask_stack[stack_size];
+ BBox1f cu_stack[stack_size];
+ BBox1f cv_stack[stack_size];
+ goto entry;
+
+ /* terminate if stack is empty */
+ while (sptr)
+ {
+ /* pop from stack */
+ {
+ sptr--;
+ size_t mask = mask_stack[sptr];
+ cu = cu_stack[sptr];
+ cv = cv_stack[sptr];
+ const size_t i = bscf(mask);
+ mask_stack[sptr] = mask;
+ if (mask) sptr++; // there are still items on the stack
+
+ /* process next element recurse into each hit curve segment */
+ const float u0 = float(i+0)*(1.0f/(VSIZEX-1));
+ const float u1 = float(i+1)*(1.0f/(VSIZEX-1));
+ const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1));
+ cu = cui;
+ }
+
+#if 0
+ solve_newton_raphson_no_recursion(cu,cv);
+ continue;
+
+#else
+ /* we assume convergence for small u ranges and verify using krawczyk */
+ if (cu.size() < 1.0f/6.0f) {
+ const bool very_small = cu.size() < 0.001f || sptr >= stack_size;
+ if (solve_krawczyk(very_small,cu,cv)) {
+ continue;
+ }
+ }
+#endif
+
+ entry:
+
+ /* split the curve into VSIZEX-1 segments in u-direction */
+ vboolx valid = true;
+ TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu);
+
+ /* slabs test in u-direction */
+ Vec2vfx ndv = cross(subcurves.axis_v());
+ BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds();
+ valid &= boundsv.lower <= eps;
+ valid &= boundsv.upper >= -eps;
+ if (none(valid)) continue;
+
+ /* slabs test in v-direction */
+ Vec2vfx ndu = cross(subcurves.axis_u());
+ BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds();
+ valid &= boundsu.lower <= eps;
+ valid &= boundsu.upper >= -eps;
+ if (none(valid)) continue;
+
+ /* push valid segments to stack */
+ assert(sptr < stack_size);
+ mask_stack [sptr] = movemask(valid);
+ cu_stack [sptr] = cu;
+ cv_stack [sptr] = cv;
+ sptr++;
+ }
+ }
+
+ __forceinline bool solve_newton_raphson_main()
+ {
+ BBox1f vu(0.0f,1.0f);
+ BBox1f vv(0.0f,1.0f);
+ solve_newton_raphson_recursion(vu,vv);
+ return isHit;
+ }
+ };
+
+
+ template<template<typename Ty> class SourceCurve>
+ struct OrientedCurve1Intersector1
+ {
+ //template<typename Ty> using Curve = SourceCurve<Ty>;
+ typedef SourceCurve<Vec3ff> SourceCurve3ff;
+ typedef SourceCurve<Vec3fa> SourceCurve3fa;
+
+ __forceinline OrientedCurve1Intersector1() {}
+
+ __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+ const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+ const Epilog& epilog) const
+ {
+ STAT3(normal.trav_prims,1,1,1);
+
+ SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
+ SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
+ ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
+ TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+ //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
+ return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+ }
+
+ template<typename Epilog>
+ __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping();
+ return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+ }
+ };
+
+ template<template<typename Ty> class SourceCurve, int K>
+ struct OrientedCurve1IntersectorK
+ {
+ //template<typename Ty> using Curve = SourceCurve<Ty>;
+ typedef SourceCurve<Vec3ff> SourceCurve3ff;
+ typedef SourceCurve<Vec3fa> SourceCurve3fa;
+
+ struct Ray1
+ {
+ __forceinline Ray1(RayK<K>& ray, size_t k)
+ : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+ Vec3fa org;
+ Vec3fa dir;
+ float _tnear;
+ float& tfar;
+
+ __forceinline float& tnear() { return _tnear; }
+ //__forceinline float& tfar() { return _tfar; }
+ __forceinline const float& tnear() const { return _tnear; }
+ //__forceinline const float& tfar() const { return _tfar; }
+ };
+
+ template<typename Epilog>
+ __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+ const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+ const Epilog& epilog)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Ray1 ray(vray,k);
+ SourceCurve3ff ccurve(v0i,v1i,v2i,v3i);
+ SourceCurve3fa ncurve(n0i,n1i,n2i,n3i);
+ ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve);
+ TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+ //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping();
+ return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const TensorLinearCubicBezierSurface3fa& curve,
+ const Epilog& epilog)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Ray1 ray(vray,k);
+ //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping();
+ return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h
new file mode 100644
index 0000000000..6e9fc91925
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/geometry.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct CurvePrecalculations1
+ {
+ float depth_scale;
+ LinearSpace3fa ray_space;
+
+ __forceinline CurvePrecalculations1() {}
+
+ __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr)
+ {
+ depth_scale = rsqrt(dot(ray.dir,ray.dir));
+ LinearSpace3fa space = frame(depth_scale*ray.dir);
+ space.vz *= depth_scale;
+ ray_space = space.transposed();
+ }
+ };
+
+ template<int K>
+ struct CurvePrecalculationsK
+ {
+ vfloat<K> depth_scale;
+ LinearSpace3fa ray_space[K];
+
+ __forceinline CurvePrecalculationsK(const vbool<K>& valid, const RayK<K>& ray)
+ {
+ size_t mask = movemask(valid);
+ depth_scale = rsqrt(dot(ray.dir,ray.dir));
+ while (mask) {
+ size_t k = bscf(mask);
+ Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+ LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k);
+ ray_space_k.vz *= depth_scale[k];
+ ray_space[k] = ray_space_k.transposed();
+ }
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h
new file mode 100644
index 0000000000..a99cf99d56
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h
@@ -0,0 +1,214 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "quad_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+#define Bezier1Intersector1 RibbonCurve1Intersector1
+#define Bezier1IntersectorK RibbonCurve1IntersectorK
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename NativeCurve3ff, int M>
+ struct RibbonHit
+ {
+ __forceinline RibbonHit() {}
+
+ __forceinline RibbonHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+ const NativeCurve3ff& curve3D)
+ : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+
+ __forceinline void finalize()
+ {
+ vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N));
+ vv = V;
+ vt = T;
+ }
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const {
+ return curve3D.eval_du(vu[i]);
+ }
+
+ public:
+ vfloat<M> U;
+ vfloat<M> V;
+ vfloat<M> T;
+ int i, N;
+ NativeCurve3ff curve3D;
+
+ public:
+ vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ };
+
+ /* calculate squared distance of point p0 to line p1->p2 */
+ __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2)
+ {
+ const vfloatx num = det(p2-p1,p1-p0);
+ const vfloatx den2 = dot(p2-p1,p2-p1);
+ return std::make_pair(num*num,den2);
+ }
+
+ /* performs culling against a cylinder */
+ __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r)
+ {
+ const std::pair<vfloatx,vfloatx> d = sqr_point_line_distance(p0,p1,p2);
+ return d.first <= r*r*d.second;
+ }
+
+ template<typename NativeCurve3ff, typename Epilog>
+ __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar,
+ const LinearSpace3fa& ray_space, const float& depth_scale,
+ const NativeCurve3ff& curve3D, const int N,
+ const Epilog& epilog)
+ {
+ /* transform control points into ray space */
+ const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org);
+ float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3)));
+
+ /* evaluate the bezier curve */
+ bool ishit = false;
+ vboolx valid = vfloatx(step) < vfloatx(float(N));
+ const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+ const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+ valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+
+ if (any(valid))
+ {
+ Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N);
+ Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N);
+ dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt);
+ dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+ const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
+ const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+ const Vec3vfx nn0 = normalize(n0);
+ const Vec3vfx nn1 = normalize(n1);
+ const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
+ const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+ const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+ const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+
+ vfloatx vu,vv,vt;
+ vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+ if (any(valid0))
+ {
+ /* ignore self intersections */
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+ vfloatx r = lerp(p0.w, p1.w, vu);
+ valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+ }
+
+ if (any(valid0))
+ {
+ vv = madd(2.0f,vv,vfloatx(-1.0f));
+ RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D);
+ ishit |= epilog(bhit.valid,bhit);
+ }
+ }
+ }
+
+ if (unlikely(VSIZEX < N))
+ {
+ /* process SIMD-size many segments per iteration */
+ for (int i=VSIZEX; i<N; i+=VSIZEX)
+ {
+ /* evaluate the bezier curve */
+ vboolx valid = vintx(i)+vintx(step) < vintx(N);
+ const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+ const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+ valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+ if (none(valid)) continue;
+
+ Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N);
+ Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N);
+ dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt);
+ dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+ const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
+ const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+ const Vec3vfx nn0 = normalize(n0);
+ const Vec3vfx nn1 = normalize(n1);
+ const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
+ const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+ const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+ const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+
+ vfloatx vu,vv,vt;
+ vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+ if (any(valid0))
+ {
+ /* ignore self intersections */
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+ vfloatx r = lerp(p0.w, p1.w, vu);
+ valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+ }
+
+ if (any(valid0))
+ {
+ vv = madd(2.0f,vv,vfloatx(-1.0f));
+ RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D);
+ ishit |= epilog(bhit.valid,bhit);
+ }
+ }
+ }
+ }
+ return ishit;
+ }
+
+ template<template<typename Ty> class NativeCurve>
+ struct RibbonCurve1Intersector1
+ {
+ typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+ template<typename Epilog>
+ __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+ const Epilog& epilog)
+ {
+ const int N = geom->tessellationRate;
+ NativeCurve3ff curve(v0,v1,v2,v3);
+ curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve);
+ return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar,
+ pre.ray_space,pre.depth_scale,
+ curve,N,
+ epilog);
+ }
+ };
+
+ template<template<typename Ty> class NativeCurve, int K>
+ struct RibbonCurve1IntersectorK
+ {
+ typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+ template<typename Epilog>
+ __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+ const Epilog& epilog)
+ {
+ const int N = geom->tessellationRate;
+ const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+ const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+ NativeCurve3ff curve(v0,v1,v2,v3);
+ curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve);
+ return intersect_ribbon<NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k],
+ pre.ray_space[k],pre.depth_scale[k],
+ curve,N,
+ epilog);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h
new file mode 100644
index 0000000000..883cedc3d2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "cylinder.h"
+#include "plane.h"
+#include "line_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ static const size_t numJacobianIterations = 5;
+#if defined(__AVX__)
+ static const size_t numBezierSubdivisions = 2;
+#else
+ static const size_t numBezierSubdivisions = 3;
+#endif
+
+ struct BezierCurveHit
+ {
+ __forceinline BezierCurveHit() {}
+
+ __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng)
+ : t(t), u(u), v(0.0f), Ng(Ng) {}
+
+ __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng)
+ : t(t), u(u), v(v), Ng(Ng) {}
+
+ __forceinline void finalize() {}
+
+ public:
+ float t;
+ float u;
+ float v;
+ Vec3fa Ng;
+ };
+
+ template<typename NativeCurve3ff, typename Ray, typename Epilog>
+ __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i,
+ const vfloatx& u, const BBox<vfloatx>& tp, const BBox<vfloatx>& h0, const BBox<vfloatx>& h1,
+ const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du,
+ const Epilog& epilog)
+ {
+ if (tp.lower[i]+dt > ray.tfar) return false;
+ Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]);
+ if (h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]);
+ if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]);
+ BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o);
+ return epilog(hit);
+ }
+
+ template<typename NativeCurve3ff, typename Ray, typename Epilog>
+ __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog)
+ {
+ const Vec3fa org = zero;
+ const Vec3fa dir = ray.dir;
+ const float length_ray_dir = length(dir);
+
+ /* error of curve evaluations is propertional to largest coordinate */
+ const BBox3ff box = curve.bounds();
+ const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
+
+ for (size_t i=0; i<numJacobianIterations; i++)
+ {
+ const Vec3fa Q = madd(Vec3fa(t),dir,org);
+ //const Vec3fa dQdu = zero;
+ const Vec3fa dQdt = dir;
+ const float Q_err = 16.0f*float(ulp)*length_ray_dir*t; // works as org=zero here
+
+ Vec3ff P,dPdu,ddPdu; curve.eval(u,P,dPdu,ddPdu);
+ //const Vec3fa dPdt = zero;
+
+ const Vec3fa R = Q-P;
+ const float len_R = length(R); //reduce_max(abs(R));
+ const float R_err = max(Q_err,P_err);
+ const Vec3fa dRdu = /*dQdu*/-dPdu;
+ const Vec3fa dRdt = dQdt;//-dPdt;
+
+ const Vec3fa T = normalize(dPdu);
+ const Vec3fa dTdu = dnormalize(dPdu,ddPdu);
+ //const Vec3fa dTdt = zero;
+ const float cos_err = P_err/length(dPdu);
+
+ /* Error estimate for dot(R,T):
+
+ dot(R,T) = cos(R,T) |R| |T|
+ = (cos(R,T) +- cos_error) * (|R| +- |R|_err) * (|T| +- |T|_err)
+ = cos(R,T)*|R|*|T|
+ +- cos(R,T)*(|R|*|T|_err + |T|*|R|_err)
+ +- cos_error*(|R| + |T|)
+ +- lower order terms
+ with cos(R,T) being in [0,1] and |T| = 1 we get:
+ dot(R,T)_err = |R|*|T|_err + |R|_err = cos_error*(|R|+1)
+ */
+
+ const float f = dot(R,T);
+ const float f_err = len_R*P_err + R_err + cos_err*(1.0f+len_R);
+ const float dfdu = dot(dRdu,T) + dot(R,dTdu);
+ const float dfdt = dot(dRdt,T);// + dot(R,dTdt);
+
+ const float K = dot(R,R)-sqr(f);
+ const float dKdu = /*2.0f*/(dot(R,dRdu)-f*dfdu);
+ const float dKdt = /*2.0f*/(dot(R,dRdt)-f*dfdt);
+ const float rsqrt_K = rsqrt(K);
+
+ const float g = sqrt(K)-P.w;
+ const float g_err = R_err + f_err + 16.0f*float(ulp)*box.upper.w;
+ const float dgdu = /*0.5f*/dKdu*rsqrt_K-dPdu.w;
+ const float dgdt = /*0.5f*/dKdt*rsqrt_K;//-dPdt.w;
+
+ const LinearSpace2f J = LinearSpace2f(dfdu,dfdt,dgdu,dgdt);
+ const Vec2f dut = rcp(J)*Vec2f(f,g);
+ const Vec2f ut = Vec2f(u,t) - dut;
+ u = ut.x; t = ut.y;
+
+ if (abs(f) < f_err && abs(g) < g_err)
+ {
+ t+=dt;
+ if (!(ray.tnear() <= t && t <= ray.tfar)) return false; // rejects NaNs
+ if (!(u >= 0.0f && u <= 1.0f)) return false; // rejects NaNs
+ const Vec3fa R = normalize(Q-P);
+ const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu);
+ const Vec3fa V = cross(dPdu,R);
+ BezierCurveHit hit(t,u,cross(V,U));
+ return epilog(hit);
+ }
+ }
+ return false;
+ }
+
+ template<typename NativeCurve3ff, typename Ray, typename Epilog>
+ bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve,
+ float u0, float u1, unsigned int depth, const Epilog& epilog)
+ {
+#if defined(__AVX__)
+ typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues
+ typedef vint8 vintx;
+ typedef vfloat8 vfloatx;
+#else
+ typedef vbool4 vboolx;
+ typedef vint4 vintx;
+ typedef vfloat4 vfloatx;
+#endif
+ typedef Vec3<vfloatx> Vec3vfx;
+ typedef Vec4<vfloatx> Vec4vfx;
+
+ unsigned int maxDepth = numBezierSubdivisions;
+ bool found = false;
+ const Vec3fa org = zero;
+ const Vec3fa dir = ray.dir;
+
+ unsigned int sptr = 0;
+ const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below
+ struct StackEntry {
+ vboolx valid;
+ vfloatx tlower;
+ float u0;
+ float u1;
+ unsigned int depth;
+ };
+ StackEntry stack[stack_size];
+ goto entry;
+
+ /* terminate if stack is empty */
+ while (sptr)
+ {
+ /* pop from stack */
+ {
+ sptr--;
+ vboolx valid = stack[sptr].valid;
+ const vfloatx tlower = stack[sptr].tlower;
+ valid &= tlower+dt <= ray.tfar;
+ if (none(valid)) continue;
+ u0 = stack[sptr].u0;
+ u1 = stack[sptr].u1;
+ depth = stack[sptr].depth;
+ const size_t i = select_min(valid,tlower); clear(valid,i);
+ stack[sptr].valid = valid;
+ if (any(valid)) sptr++; // there are still items on the stack
+
+ /* process next segment */
+ const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+ u0 = vu0[i+0];
+ u1 = vu0[i+1];
+ }
+ entry:
+
+ /* subdivide curve */
+ const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1)));
+ const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+ Vec4vfx P0, dP0du; curve.veval(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale);
+ const Vec4vfx P3 = shift_right_1(P0);
+ const Vec4vfx dP3du = shift_right_1(dP0du);
+ const Vec4vfx P1 = P0 + dP0du;
+ const Vec4vfx P2 = P3 - dP3du;
+
+ /* calculate bounding cylinders */
+ const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0));
+ const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0));
+ const vfloatx maxr12 = sqrt(max(rr1,rr2));
+ const vfloatx one_plus_ulp = 1.0f+2.0f*float(ulp);
+ const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp);
+ vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12;
+ vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12;
+ r_outer = one_plus_ulp*r_outer;
+ r_inner = max(0.0f,one_minus_ulp*r_inner);
+ const CylinderN<vfloatx::size> cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer);
+ const CylinderN<vfloatx::size> cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner);
+ vboolx valid = true; clear(valid,vfloatx::size-1);
+
+ /* intersect with outer cylinder */
+ BBox<vfloatx> tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1;
+ valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1);
+ if (none(valid)) continue;
+
+ /* intersect with cap-planes */
+ BBox<vfloatx> tp(ray.tnear()-dt,ray.tfar-dt);
+ tp = embree::intersect(tp,tc_outer);
+ BBox<vfloatx> h0 = HalfPlaneN<vfloatx::size>(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir);
+ tp = embree::intersect(tp,h0);
+ BBox<vfloatx> h1 = HalfPlaneN<vfloatx::size>(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir);
+ tp = embree::intersect(tp,h1);
+ valid &= tp.lower <= tp.upper;
+ if (none(valid)) continue;
+
+ /* clamp and correct u parameter */
+ u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f));
+ u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f));
+ u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size)));
+ u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size)));
+
+ /* intersect with inner cylinder */
+ BBox<vfloatx> tc_inner;
+ vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero;
+ const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1);
+
+ /* at the unstable area we subdivide deeper */
+ const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f);
+ const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f);
+
+ /* subtract the inner interval from the current hit interval */
+ BBox<vfloatx> tp0, tp1;
+ subtract(tp,tc_inner,tp0,tp1);
+ vboolx valid0 = valid & (tp0.lower <= tp0.upper);
+ vboolx valid1 = valid & (tp1.lower <= tp1.upper);
+ if (none(valid0 | valid1)) continue;
+
+ /* iterate over all first hits front to back */
+ const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth));
+ vboolx recursion_valid0 = valid0 & (depth < termDepth0);
+ valid0 &= depth >= termDepth0;
+
+ while (any(valid0))
+ {
+ const size_t i = select_min(valid0,tp0.lower); clear(valid0,i);
+ found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog);
+ //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog);
+ valid0 &= tp0.lower+dt <= ray.tfar;
+ }
+ valid1 &= tp1.lower+dt <= ray.tfar;
+
+ /* iterate over all second hits front to back */
+ const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth));
+ vboolx recursion_valid1 = valid1 & (depth < termDepth1);
+ valid1 &= depth >= termDepth1;
+ while (any(valid1))
+ {
+ const size_t i = select_min(valid1,tp1.lower); clear(valid1,i);
+ found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog);
+ //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog);
+ valid1 &= tp1.lower+dt <= ray.tfar;
+ }
+
+ /* push valid segments to stack */
+ recursion_valid0 &= tp0.lower+dt <= ray.tfar;
+ recursion_valid1 &= tp1.lower+dt <= ray.tfar;
+ const vboolx recursion_valid = recursion_valid0 | recursion_valid1;
+ if (any(recursion_valid))
+ {
+ assert(sptr < stack_size);
+ stack[sptr].valid = recursion_valid;
+ stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower);
+ stack[sptr].u0 = u0;
+ stack[sptr].u1 = u1;
+ stack[sptr].depth = depth+1;
+ sptr++;
+ }
+ }
+ return found;
+ }
+
+ template<template<typename Ty> class NativeCurve>
+ struct SweepCurve1Intersector1
+ {
+ typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+ template<typename Epilog>
+ __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+ const Epilog& epilog)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+
+ /* move ray closer to make intersection stable */
+ NativeCurve3ff curve0(v0,v1,v2,v3);
+ curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0);
+ const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir));
+ const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f);
+ const NativeCurve3ff curve1 = curve0-ref;
+ return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog);
+ }
+ };
+
+ template<template<typename Ty> class NativeCurve, int K>
+ struct SweepCurve1IntersectorK
+ {
+ typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+ struct Ray1
+ {
+ __forceinline Ray1(RayK<K>& ray, size_t k)
+ : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+ Vec3fa org;
+ Vec3fa dir;
+ float _tnear;
+ float& tfar;
+
+ __forceinline float& tnear() { return _tnear; }
+ //__forceinline float& tfar() { return _tfar; }
+ __forceinline const float& tnear() const { return _tnear; }
+ //__forceinline const float& tfar() const { return _tfar; }
+
+ };
+
+ template<typename Epilog>
+ __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+ IntersectContext* context,
+ const CurveGeometry* geom, const unsigned int primID,
+ const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+ const Epilog& epilog)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Ray1 ray(vray,k);
+
+ /* move ray closer to make intersection stable */
+ NativeCurve3ff curve0(v0,v1,v2,v3);
+ curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0);
+ const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir));
+ const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f);
+ const NativeCurve3ff curve1 = curve0-ref;
+ return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h
new file mode 100644
index 0000000000..e1f4238130
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h
@@ -0,0 +1,671 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "curve_intersector_precalculations.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+#include "intersector_epilog.h"
+
+#include "../subdiv/bezier_curve.h"
+#include "../subdiv/bspline_curve.h"
+#include "../subdiv/hermite_curve.h"
+#include "../subdiv/catmullrom_curve.h"
+
+#include "spherei_intersector.h"
+#include "disci_intersector.h"
+
+#include "linei_intersector.h"
+#include "roundlinei_intersector.h"
+#include "conelinei_intersector.h"
+
+#include "curveNi_intersector.h"
+#include "curveNv_intersector.h"
+#include "curveNi_mb_intersector.h"
+
+#include "curve_intersector_distance.h"
+#include "curve_intersector_ribbon.h"
+#include "curve_intersector_oriented.h"
+#include "curve_intersector_sweep.h"
+
+namespace embree
+{
+ struct VirtualCurveIntersector
+ {
+ typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive);
+ typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive);
+
+ typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+ typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+ typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+ typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+ typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+ typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+ public:
+ struct Intersectors
+ {
+ Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp.
+
+ template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive);
+ template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive);
+
+ template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+ template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+ public:
+ Intersect1Ty intersect1;
+ Occluded1Ty occluded1;
+ Intersect4Ty intersect4;
+ Occluded4Ty occluded4;
+ Intersect8Ty intersect8;
+ Occluded8Ty occluded8;
+ Intersect16Ty intersect16;
+ Occluded16Ty occluded16;
+ };
+
+ Intersectors vtbl[Geometry::GTY_END];
+ };
+
+ template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); }
+ template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); }
+
+ template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); }
+ template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); }
+
+#if defined(__AVX__)
+ template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); }
+ template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); }
+#endif
+
+#if defined(__AVX512F__)
+ template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); }
+ template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); }
+#endif
+
+ namespace isa
+ {
+ struct VirtualCurveIntersector1
+ {
+ typedef unsigned char Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+ leafIntersector.intersect<1>(&pre,&ray,context,prim);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+ return leafIntersector.occluded<1>(&pre,&ray,context,prim);
+ }
+ };
+
+ template<int K>
+ struct VirtualCurveIntersectorK
+ {
+ typedef unsigned char Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+ size_t mask = movemask(valid_i);
+ while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim);
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+ vbool<K> valid_o = false;
+ size_t mask = movemask(valid_i);
+ while (mask) {
+ size_t k = bscf(mask);
+ if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim))
+ set(valid_o, k);
+ }
+ return valid_o;
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+ leafIntersector.intersect<K>(&pre,&ray,k,context,prim);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ assert(num == 1);
+ RTCGeometryType ty = (RTCGeometryType)(*prim);
+ assert(This->leafIntersector);
+ VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty];
+ return leafIntersector.occluded<K>(&pre,&ray,k,context,prim);
+ }
+ };
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiMBIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiMBIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiMBIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors SphereNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiMBIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiMBIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiMBIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors DiscNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiMBIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiMBIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiMBIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<int N>
+ static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1<N,N,true>::intersect;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiMBIntersector1<N,N,true>::occluded;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK<N,N,4,true>::intersect;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiMBIntersectorK<N,N,8,true>::intersect;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiMBIntersectorK<N,N,16,true>::intersect;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors RibbonNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors RibbonNvIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors CurveNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors CurveNvIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNvIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+
+ template<template<typename Ty> class Curve, int N>
+ static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors()
+ {
+ VirtualCurveIntersector::Intersectors intersectors;
+ intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+ intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+ intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+ intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+ intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+ intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+ intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+ intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+ return intersectors;
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h
new file mode 100644
index 0000000000..69cf612275
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+ void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h
new file mode 100644
index 0000000000..d37e41098e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+ void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
new file mode 100644
index 0000000000..a133a11d63
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+ void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h
new file mode 100644
index 0000000000..9aec35da45
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+ void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h
new file mode 100644
index 0000000000..dd37d194f5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim);
+#if defined(__AVX__)
+ void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim);
+ void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim);
+ void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h
new file mode 100644
index 0000000000..fe5ceed840
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h
@@ -0,0 +1,22 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim);
+ void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim);
+ void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim);
+
+#if defined (__AVX__)
+ void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim);
+ void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim);
+ void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/cylinder.h b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h
new file mode 100644
index 0000000000..39a582864c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h
@@ -0,0 +1,223 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct Cylinder
+ {
+ const Vec3fa p0; //!< start location
+ const Vec3fa p1; //!< end position
+ const float rr; //!< squared radius of cylinder
+
+ __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r)
+ : p0(p0), p1(p1), rr(sqr(r)) {}
+
+ __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool)
+ : p0(p0), p1(p1), rr(rr) {}
+
+ __forceinline bool intersect(const Vec3fa& org,
+ const Vec3fa& dir,
+ BBox1f& t_o,
+ float& u0_o, Vec3fa& Ng0_o,
+ float& u1_o, Vec3fa& Ng1_o) const
+ {
+ /* calculate quadratic equation to solve */
+ const float rl = rcp_length(p1-p0);
+ const Vec3fa P0 = p0, dP = (p1-p0)*rl;
+ const Vec3fa O = org-P0, dO = dir;
+
+ const float dOdO = dot(dO,dO);
+ const float OdO = dot(dO,O);
+ const float OO = dot(O,O);
+ const float dOz = dot(dP,dO);
+ const float Oz = dot(dP,O);
+
+ const float A = dOdO - sqr(dOz);
+ const float B = 2.0f * (OdO - dOz*Oz);
+ const float C = OO - sqr(Oz) - rr;
+
+ /* we miss the cylinder if determinant is smaller than zero */
+ const float D = B*B - 4.0f*A*C;
+ if (D < 0.0f) {
+ t_o = BBox1f(pos_inf,neg_inf);
+ return false;
+ }
+
+ /* special case for rays that are parallel to the cylinder */
+ const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+ if (abs(A) < eps)
+ {
+ if (C <= 0.0f) {
+ t_o = BBox1f(neg_inf,pos_inf);
+ return true;
+ } else {
+ t_o = BBox1f(pos_inf,neg_inf);
+ return false;
+ }
+ }
+
+ /* standard case for rays that are not parallel to the cylinder */
+ const float Q = sqrt(D);
+ const float rcp_2A = rcp(2.0f*A);
+ const float t0 = (-B-Q)*rcp_2A;
+ const float t1 = (-B+Q)*rcp_2A;
+
+ /* calculates u and Ng for near hit */
+ {
+ u0_o = madd(t0,dOz,Oz)*rl;
+ const Vec3fa Pr = t0*dir;
+ const Vec3fa Pl = madd(u0_o,p1-p0,p0);
+ Ng0_o = Pr-Pl;
+ }
+
+ /* calculates u and Ng for far hit */
+ {
+ u1_o = madd(t1,dOz,Oz)*rl;
+ const Vec3fa Pr = t1*dir;
+ const Vec3fa Pl = madd(u1_o,p1-p0,p0);
+ Ng1_o = Pr-Pl;
+ }
+
+ t_o.lower = t0;
+ t_o.upper = t1;
+ return true;
+ }
+
+ __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const
+ {
+ float u0_o; Vec3fa Ng0_o;
+ float u1_o; Vec3fa Ng1_o;
+ return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+ }
+
+ static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1)
+ {
+ float eps = 0.001f;
+ BBox1f t; bool hit;
+ hit = cylinder.intersect(ray.org,ray.dir,t);
+
+ bool failed = hit != shouldhit;
+ if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps;
+ if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : abs(t1-t.upper) > eps;
+ if (!failed) return true;
+ embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl;
+ return false;
+ }
+
+ /* verify cylinder class */
+ static bool verify()
+ {
+ bool passed = true;
+ const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f);
+ passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+ passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+ passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+ passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+ passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+ passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+ passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+ return passed;
+ }
+
+ /*! output operator */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) {
+ return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}";
+ }
+ };
+
+ template<int N>
+ struct CylinderN
+ {
+ const Vec3vf<N> p0; //!< start location
+ const Vec3vf<N> p1; //!< end position
+ const vfloat<N> rr; //!< squared radius of cylinder
+
+ __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& r)
+ : p0(p0), p1(p1), rr(sqr(r)) {}
+
+ __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& rr, bool)
+ : p0(p0), p1(p1), rr(rr) {}
+
+
+ __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir,
+ BBox<vfloat<N>>& t_o,
+ vfloat<N>& u0_o, Vec3vf<N>& Ng0_o,
+ vfloat<N>& u1_o, Vec3vf<N>& Ng1_o) const
+ {
+ /* calculate quadratic equation to solve */
+ const vfloat<N> rl = rcp_length(p1-p0);
+ const Vec3vf<N> P0 = p0, dP = (p1-p0)*rl;
+ const Vec3vf<N> O = Vec3vf<N>(org)-P0, dO = dir;
+
+ const vfloat<N> dOdO = dot(dO,dO);
+ const vfloat<N> OdO = dot(dO,O);
+ const vfloat<N> OO = dot(O,O);
+ const vfloat<N> dOz = dot(dP,dO);
+ const vfloat<N> Oz = dot(dP,O);
+
+ const vfloat<N> A = dOdO - sqr(dOz);
+ const vfloat<N> B = 2.0f * (OdO - dOz*Oz);
+ const vfloat<N> C = OO - sqr(Oz) - rr;
+
+ /* we miss the cylinder if determinant is smaller than zero */
+ const vfloat<N> D = B*B - 4.0f*A*C;
+ vbool<N> valid = D >= 0.0f;
+ if (none(valid)) {
+ t_o = BBox<vfloat<N>>(empty);
+ return valid;
+ }
+
+ /* standard case for rays that are not parallel to the cylinder */
+ const vfloat<N> Q = sqrt(D);
+ const vfloat<N> rcp_2A = rcp(2.0f*A);
+ const vfloat<N> t0 = (-B-Q)*rcp_2A;
+ const vfloat<N> t1 = (-B+Q)*rcp_2A;
+
+ /* calculates u and Ng for near hit */
+ {
+ u0_o = madd(t0,dOz,Oz)*rl;
+ const Vec3vf<N> Pr = t0*Vec3vf<N>(dir);
+ const Vec3vf<N> Pl = madd(u0_o,p1-p0,p0);
+ Ng0_o = Pr-Pl;
+ }
+
+ /* calculates u and Ng for far hit */
+ {
+ u1_o = madd(t1,dOz,Oz)*rl;
+ const Vec3vf<N> Pr = t1*Vec3vf<N>(dir);
+ const Vec3vf<N> Pl = madd(u1_o,p1-p0,p0);
+ Ng1_o = Pr-Pl;
+ }
+
+ t_o.lower = select(valid, t0, vfloat<N>(pos_inf));
+ t_o.upper = select(valid, t1, vfloat<N>(neg_inf));
+
+ /* special case for rays that are parallel to the cylinder */
+ const vfloat<N> eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+ vbool<N> validt = valid & (abs(A) < eps);
+ if (unlikely(any(validt)))
+ {
+ vbool<N> inside = C <= 0.0f;
+ t_o.lower = select(validt,select(inside,vfloat<N>(neg_inf),vfloat<N>(pos_inf)),t_o.lower);
+ t_o.upper = select(validt,select(inside,vfloat<N>(pos_inf),vfloat<N>(neg_inf)),t_o.upper);
+ valid &= !validt | inside;
+ }
+ return valid;
+ }
+
+ __forceinline vbool<N> intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+ {
+ vfloat<N> u0_o; Vec3vf<N> Ng0_o;
+ vfloat<N> u1_o; Vec3vf<N> Ng1_o;
+ return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+ }
+ };
+ }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h
new file mode 100644
index 0000000000..e8305780e5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h
@@ -0,0 +1,216 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct DiscIntersectorHitM
+ {
+ __forceinline DiscIntersectorHitM() {}
+
+ __forceinline DiscIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+ : vu(u), vv(v), vt(t), vNg(Ng)
+ {
+ }
+
+ __forceinline void finalize() {}
+
+ __forceinline Vec2f uv(const size_t i) const
+ {
+ return Vec2f(vu[i], vv[i]);
+ }
+ __forceinline float t(const size_t i) const
+ {
+ return vt[i];
+ }
+ __forceinline Vec3fa Ng(const size_t i) const
+ {
+ return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
+ }
+
+ public:
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct DiscIntersector1
+ {
+ typedef CurvePrecalculations1 Precalculations;
+
+ template<typename Epilog>
+ static __forceinline bool intersect(
+ const vbool<M>& valid_i,
+ Ray& ray,
+ IntersectContext* context,
+ const Points* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+ const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+ const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir));
+
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec3vf<M> center = v0.xyz();
+ const vfloat<M> radius = v0.w;
+
+ const Vec3vf<M> c0 = center - ray_org;
+ const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+
+ valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar));
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+ valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections
+ if (unlikely(none(valid)))
+ return false;
+
+ const Vec3vf<M> perp = c0 - projC0 * ray_dir;
+ const vfloat<M> l2 = dot(perp, perp);
+ const vfloat<M> r2 = radius * radius;
+ valid &= (l2 <= r2);
+ if (unlikely(none(valid)))
+ return false;
+
+ DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
+ return epilog(valid, hit);
+ }
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ Ray& ray,
+ IntersectContext* context,
+ const Points* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i,
+ const Vec3vf<M>& normal,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec3vf<M> center = v0.xyz();
+ const vfloat<M> radius = v0.w;
+
+ vfloat<M> divisor = dot(Vec3vf<M>((Vec3fa)ray.dir), normal);
+ const vbool<M> parallel = divisor == vfloat<M>(0.f);
+ valid &= !parallel;
+ divisor = select(parallel, 1.f, divisor); // prevent divide by zero
+
+ vfloat<M> t = dot(center - Vec3vf<M>((Vec3fa)ray.org), Vec3vf<M>(normal)) / divisor;
+
+ valid &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+ if (unlikely(none(valid)))
+ return false;
+
+ Vec3vf<M> intersection = Vec3vf<M>((Vec3fa)ray.org) + Vec3vf<M>((Vec3fa)ray.dir) * t;
+ vfloat<M> dist2 = dot(intersection - center, intersection - center);
+ valid &= dist2 < radius * radius;
+ if (unlikely(none(valid)))
+ return false;
+
+ DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+ return epilog(valid, hit);
+ }
+ };
+
+ template<int M, int K>
+ struct DiscIntersectorK
+ {
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ RayK<K>& ray,
+ size_t k,
+ IntersectContext* context,
+ const Points* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+
+ const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+ const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir));
+
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec3vf<M> center = v0.xyz();
+ const vfloat<M> radius = v0.w;
+
+ const Vec3vf<M> c0 = center - ray_org;
+ const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+
+ valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k]));
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+ valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections
+ if (unlikely(none(valid)))
+ return false;
+
+ const Vec3vf<M> perp = c0 - projC0 * ray_dir;
+ const vfloat<M> l2 = dot(perp, perp);
+ const vfloat<M> r2 = radius * radius;
+ valid &= (l2 <= r2);
+ if (unlikely(none(valid)))
+ return false;
+
+ DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
+ return epilog(valid, hit);
+ }
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ RayK<K>& ray,
+ size_t k,
+ IntersectContext* context,
+ const Points* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i,
+ const Vec3vf<M>& normal,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+ const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec3vf<M> center = v0.xyz();
+ const vfloat<M> radius = v0.w;
+
+ vfloat<M> divisor = dot(Vec3vf<M>(ray_dir), normal);
+ const vbool<M> parallel = divisor == vfloat<M>(0.f);
+ valid &= !parallel;
+ divisor = select(parallel, 1.f, divisor); // prevent divide by zero
+
+ vfloat<M> t = dot(center - Vec3vf<M>(ray_org), Vec3vf<M>(normal)) / divisor;
+
+ valid &= (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+ if (unlikely(none(valid)))
+ return false;
+
+ Vec3vf<M> intersection = Vec3vf<M>(ray_org) + Vec3vf<M>(ray_dir) * t;
+ vfloat<M> dist2 = dot(intersection - center, intersection - center);
+ valid &= dist2 < radius * radius;
+ if (unlikely(none(valid)))
+ return false;
+
+ DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+ return epilog(valid, hit);
+ }
+ };
+ } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h
new file mode 100644
index 0000000000..e1dc3aa98e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "disc_intersector.h"
+#include "intersector_epilog.h"
+#include "pointi.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M, int Mx, bool filter>
+ struct DiscMiIntersector1
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre,
+ RayHit& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre,
+ Ray& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct DiscMiMBIntersector1
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre,
+ RayHit& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom, ray.time());
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre,
+ Ray& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom, ray.time());
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct DiscMiIntersectorK
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(
+ const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(
+ const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct DiscMiMBIntersectorK
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(
+ const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(
+ const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0, Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct OrientedDiscMiIntersector1
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre,
+ RayHit& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre,
+ Ray& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct OrientedDiscMiMBIntersector1
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre,
+ RayHit& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom, ray.time());
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre,
+ Ray& ray,
+ IntersectContext* context,
+ const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom, ray.time());
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct OrientedDiscMiIntersectorK
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(
+ const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0, n0,
+ Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(
+ const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0, n0,
+ Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct OrientedDiscMiMBIntersectorK
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(
+ const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom, ray.time()[k]);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0, n0,
+ Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+
+ static __forceinline bool occluded(
+ const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(Disc.geomID());
+ Vec4vf<M> v0; Vec3vf<M> n0;
+ Disc.gather(v0, n0, geom, ray.time()[k]);
+ const vbool<Mx> valid = Disc.template valid<Mx>();
+ return DiscIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0, n0,
+ Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+ }
+ };
+ } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/filter.h b/thirdparty/embree-aarch64/kernels/geometry/filter.h
new file mode 100644
index 0000000000..4cdf7a395a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/filter.h
@@ -0,0 +1,204 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/ray.h"
+#include "../common/hit.h"
+#include "../common/context.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+ {
+ if (geometry->intersectionFilterN)
+ {
+ assert(context->scene->hasGeometryFilterFunction());
+ geometry->intersectionFilterN(args);
+
+ if (args->valid[0] == 0)
+ return false;
+ }
+
+ if (context->user->filter) {
+ assert(context->scene->hasContextFilterFunction());
+ context->user->filter(args);
+
+ if (args->valid[0] == 0)
+ return false;
+ }
+
+ copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit);
+ return true;
+ }
+
+ __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit)
+ {
+ RTCFilterFunctionNArguments args;
+ int mask = -1;
+ args.valid = &mask;
+ args.geometryUserPtr = geometry->userPtr;
+ args.context = context->user;
+ args.ray = (RTCRayN*)&ray;
+ args.hit = (RTCHitN*)&hit;
+ args.N = 1;
+ return runIntersectionFilter1Helper(&args,geometry,context);
+ }
+
+ __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+ {
+#if defined(EMBREE_FILTER_FUNCTION)
+ IntersectContext* MAYBE_UNUSED context = args->internal_context;
+ const Geometry* const geometry = args->geometry;
+ if (geometry->intersectionFilterN) {
+ assert(context->scene->hasGeometryFilterFunction());
+ geometry->intersectionFilterN(filter_args);
+ }
+
+ //if (args->valid[0] == 0)
+ // return;
+
+ if (context->user->filter) {
+ assert(context->scene->hasContextFilterFunction());
+ context->user->filter(filter_args);
+ }
+#endif
+ }
+
+ __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+ {
+ if (geometry->occlusionFilterN)
+ {
+ assert(context->scene->hasGeometryFilterFunction());
+ geometry->occlusionFilterN(args);
+
+ if (args->valid[0] == 0)
+ return false;
+ }
+
+ if (context->user->filter) {
+ assert(context->scene->hasContextFilterFunction());
+ context->user->filter(args);
+
+ if (args->valid[0] == 0)
+ return false;
+ }
+ return true;
+ }
+
+ __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit)
+ {
+ RTCFilterFunctionNArguments args;
+ int mask = -1;
+ args.valid = &mask;
+ args.geometryUserPtr = geometry->userPtr;
+ args.context = context->user;
+ args.ray = (RTCRayN*)&ray;
+ args.hit = (RTCHitN*)&hit;
+ args.N = 1;
+ return runOcclusionFilter1Helper(&args,geometry,context);
+ }
+
+ __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+ {
+#if defined(EMBREE_FILTER_FUNCTION)
+ IntersectContext* MAYBE_UNUSED context = args->internal_context;
+ const Geometry* const geometry = args->geometry;
+ if (geometry->occlusionFilterN) {
+ assert(context->scene->hasGeometryFilterFunction());
+ geometry->occlusionFilterN(filter_args);
+ }
+
+ //if (args->valid[0] == 0)
+ // return false;
+
+ if (context->user->filter) {
+ assert(context->scene->hasContextFilterFunction());
+ context->user->filter(filter_args);
+ }
+#endif
+ }
+
+ template<int K>
+ __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+ {
+ vint<K>* mask = (vint<K>*) args->valid;
+ if (geometry->intersectionFilterN)
+ {
+ assert(context->scene->hasGeometryFilterFunction());
+ geometry->intersectionFilterN(args);
+ }
+
+ vbool<K> valid_o = *mask != vint<K>(zero);
+ if (none(valid_o)) return valid_o;
+
+ if (context->user->filter) {
+ assert(context->scene->hasContextFilterFunction());
+ context->user->filter(args);
+ }
+
+ valid_o = *mask != vint<K>(zero);
+ if (none(valid_o)) return valid_o;
+
+ copyHitToRay(valid_o,*(RayHitK<K>*)args->ray,*(HitK<K>*)args->hit);
+ return valid_o;
+ }
+
+ template<int K>
+ __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit)
+ {
+ RTCFilterFunctionNArguments args;
+ vint<K> mask = valid.mask32();
+ args.valid = (int*)&mask;
+ args.geometryUserPtr = geometry->userPtr;
+ args.context = context->user;
+ args.ray = (RTCRayN*)&ray;
+ args.hit = (RTCHitN*)&hit;
+ args.N = K;
+ return runIntersectionFilterHelper<K>(&args,geometry,context);
+ }
+
+ template<int K>
+ __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+ {
+ vint<K>* mask = (vint<K>*) args->valid;
+ if (geometry->occlusionFilterN)
+ {
+ assert(context->scene->hasGeometryFilterFunction());
+ geometry->occlusionFilterN(args);
+ }
+
+ vbool<K> valid_o = *mask != vint<K>(zero);
+
+ if (none(valid_o)) return valid_o;
+
+ if (context->user->filter) {
+ assert(context->scene->hasContextFilterFunction());
+ context->user->filter(args);
+ }
+
+ valid_o = *mask != vint<K>(zero);
+
+ RayK<K>* ray = (RayK<K>*) args->ray;
+ ray->tfar = select(valid_o, vfloat<K>(neg_inf), ray->tfar);
+ return valid_o;
+ }
+
+ template<int K>
+ __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit)
+ {
+ RTCFilterFunctionNArguments args;
+ vint<K> mask = valid.mask32();
+ args.valid = (int*)&mask;
+ args.geometryUserPtr = geometry->userPtr;
+ args.context = context->user;
+ args.ray = (RTCRayN*)&ray;
+ args.hit = (RTCHitN*)&hit;
+ args.N = K;
+ return runOcclusionFilterHelper<K>(&args,geometry,context);
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h
new file mode 100644
index 0000000000..46a0af0827
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h
@@ -0,0 +1,99 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename T>
+ class SubdivPatch1Precalculations : public T
+ {
+ public:
+ __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr)
+ : T(ray,ptr) {}
+ };
+
+ template<int K, typename T>
+ class SubdivPatch1PrecalculationsK : public T
+ {
+ public:
+ __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray)
+ : T(valid,ray) {}
+ };
+
+ class Grid1Intersector1
+ {
+ public:
+ typedef GridSOA Primitive;
+ typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+ {
+ GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+ }
+ static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+ intersect(pre,ray,context,prim,ty,lazy_node);
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+ {
+ GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+ }
+ static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+ return occluded(pre,ray,context,prim,ty,lazy_node);
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+ assert(false && "not implemented");
+ return false;
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+ assert(false && "not implemented");
+ return false;
+ }
+ };
+
+ template <int K>
+ struct GridIntersectorK
+ {
+ typedef GridSOA Primitive;
+ typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+
+
+ static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+ {
+ GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+ {
+ GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+ {
+ GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+ {
+ GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+ }
+ };
+
+ typedef Grid1IntersectorK<4> SubdivPatch1Intersector4;
+ typedef Grid1IntersectorK<8> SubdivPatch1Intersector8;
+ typedef Grid1IntersectorK<16> SubdivPatch1Intersector16;
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h
new file mode 100644
index 0000000000..d3b275586c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h
@@ -0,0 +1,275 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_subdiv_mesh.h"
+#include "../bvh/bvh.h"
+#include "../subdiv/tessellation.h"
+#include "../subdiv/tessellation_cache.h"
+#include "subdivpatch1.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ class GridSOA
+ {
+ public:
+
+ /*! GridSOA constructor */
+ GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps,
+ const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight,
+ const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr);
+
+ /*! Subgrid creation */
+ template<typename Allocator>
+ static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps,
+ unsigned x0, unsigned x1, unsigned y0, unsigned y1,
+ const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr)
+ {
+ const unsigned width = x1-x0+1;
+ const unsigned height = y1-y0+1;
+ const GridRange range(0,width-1,0,height-1);
+ size_t bvhBytes = 0;
+ if (time_steps == 1)
+ bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0);
+ else {
+ bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0);
+ bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D));
+ }
+ const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);
+ size_t rootBytes = time_steps*sizeof(BVH4::NodeRef);
+#if !defined(__X86_64__) && !defined(__aarch64__)
+ rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
+#endif
+ void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
+ assert(data);
+ return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get<SubdivMesh>(patches->geomID()),bvhBytes,gridBytes,bounds_o);
+ }
+
+ /*! Grid creation */
+ template<typename Allocator>
+ static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps,
+ const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr)
+ {
+ return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o);
+ }
+
+ /*! returns reference to root */
+ __forceinline BVH4::NodeRef& root(size_t t = 0) { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+ __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+
+ /*! returns pointer to BVH array */
+ __forceinline int8_t* bvhData() { return &data[0]; }
+ __forceinline const int8_t* bvhData() const { return &data[0]; }
+
+ /*! returns pointer to Grid array */
+ __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; }
+ __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; }
+
+ __forceinline void* encodeLeaf(size_t u, size_t v) {
+ return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf
+ }
+ __forceinline float* decodeLeaf(size_t t, const void* ptr) {
+ return gridData(t) + (((size_t) (ptr) >> 4) - 1);
+ }
+
+ /*! returns the size of the BVH over the grid in bytes */
+ static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes);
+
+ /*! returns the size of the temporal BVH over the time range BVHs */
+ static size_t getTemporalBVHBytes(const range<int> time_range, const size_t nodeBytes);
+
+ /*! calculates bounding box of grid range */
+ __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const
+ {
+ const float* const grid_array = gridData(time);
+ const float* const grid_x_array = grid_array + 0 * dim_offset;
+ const float* const grid_y_array = grid_array + 1 * dim_offset;
+ const float* const grid_z_array = grid_array + 2 * dim_offset;
+
+ /* compute the bounds just for the range! */
+ BBox3fa bounds( empty );
+ for (unsigned v = range.v_start; v<=range.v_end; v++)
+ {
+ for (unsigned u = range.u_start; u<=range.u_end; u++)
+ {
+ const float x = grid_x_array[ v * width + u];
+ const float y = grid_y_array[ v * width + u];
+ const float z = grid_z_array[ v * width + u];
+ bounds.extend( Vec3fa(x,y,z) );
+ }
+ }
+ assert(is_finite(bounds));
+ return bounds;
+ }
+
+ /*! Evaluates grid over patch and builds BVH4 tree over the grid. */
+ std::pair<BVH4::NodeRef,BBox3fa> buildBVH(BBox3fa* bounds_o);
+
+ /*! Create BVH4 tree over grid. */
+ std::pair<BVH4::NodeRef,BBox3fa> buildBVH(const GridRange& range, size_t& allocator);
+
+ /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */
+ std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, BBox3fa* bounds_o);
+
+ /*! Create MBlur BVH4 tree over grid. */
+ std::pair<BVH4::NodeRef,LBBox3fa> buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator);
+
+ /*! Create MSMBlur BVH4 tree over grid. */
+ std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, size_t& allocator, BBox3fa* bounds_o);
+
+ template<typename Loader>
+ struct MapUV
+ {
+ typedef typename Loader::vfloat vfloat;
+ const float* const grid_uv;
+ size_t line_offset;
+ size_t lines;
+
+ __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines)
+ : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {}
+
+ __forceinline void operator() (vfloat& u, vfloat& v) const {
+ const Vec3<vfloat> tri_v012_uv = Loader::gather(grid_uv,line_offset,lines);
+ const Vec2<vfloat> uv0 = GridSOA::decodeUV(tri_v012_uv[0]);
+ const Vec2<vfloat> uv1 = GridSOA::decodeUV(tri_v012_uv[1]);
+ const Vec2<vfloat> uv2 = GridSOA::decodeUV(tri_v012_uv[2]);
+ const Vec2<vfloat> uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0;
+ u = uv[0];v = uv[1];
+ }
+ };
+
+ struct Gather2x3
+ {
+ enum { M = 4 };
+ typedef vbool4 vbool;
+ typedef vint4 vint;
+ typedef vfloat4 vfloat;
+
+ static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines)
+ {
+ vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset);
+ vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+ if (unlikely(line_offset == 2))
+ {
+ r0 = shuffle<0,1,1,1>(r0);
+ r1 = shuffle<0,1,1,1>(r1);
+ }
+ return Vec3vf4(unpacklo(r0,r1), // r00, r10, r01, r11
+ shuffle<1,1,2,2>(r0), // r01, r01, r02, r02
+ shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12
+ }
+
+ static __forceinline void gather(const float* const grid_x,
+ const float* const grid_y,
+ const float* const grid_z,
+ const size_t line_offset,
+ const size_t lines,
+ Vec3vf4& v0_o,
+ Vec3vf4& v1_o,
+ Vec3vf4& v2_o)
+ {
+ const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines);
+ const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines);
+ const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines);
+ v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+ v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+ v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+ }
+ };
+
+#if defined (__AVX__)
+ struct Gather3x3
+ {
+ enum { M = 8 };
+ typedef vbool8 vbool;
+ typedef vint8 vint;
+ typedef vfloat8 vfloat;
+
+ static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines)
+ {
+ vfloat4 ra = vfloat4::loadu(grid + 0*line_offset);
+ vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+ vfloat4 rc;
+ if (likely(lines > 2))
+ rc = vfloat4::loadu(grid + 2*line_offset);
+ else
+ rc = rb;
+
+ if (unlikely(line_offset == 2))
+ {
+ ra = shuffle<0,1,1,1>(ra);
+ rb = shuffle<0,1,1,1>(rb);
+ rc = shuffle<0,1,1,1>(rc);
+ }
+
+ const vfloat8 r0 = vfloat8(ra,rb);
+ const vfloat8 r1 = vfloat8(rb,rc);
+ return Vec3vf8(unpacklo(r0,r1), // r00, r10, r01, r11, r10, r20, r11, r21
+ shuffle<1,1,2,2>(r0), // r01, r01, r02, r02, r11, r11, r12, r12
+ shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12, r20, r21, r21, r22
+ }
+
+ static __forceinline void gather(const float* const grid_x,
+ const float* const grid_y,
+ const float* const grid_z,
+ const size_t line_offset,
+ const size_t lines,
+ Vec3vf8& v0_o,
+ Vec3vf8& v1_o,
+ Vec3vf8& v2_o)
+ {
+ const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines);
+ const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines);
+ const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines);
+ v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+ v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+ v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+ }
+ };
+#endif
+
+ template<typename vfloat>
+ static __forceinline Vec2<vfloat> decodeUV(const vfloat& uv)
+ {
+ typedef typename vfloat::Int vint;
+ const vint iu = asInt(uv) & 0xffff;
+ const vint iv = srl(asInt(uv),16);
+ const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000);
+ const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000);
+ return Vec2<vfloat>(u,v);
+ }
+
+ __forceinline unsigned int geomID() const {
+ return _geomID;
+ }
+
+ __forceinline unsigned int primID() const {
+ return _primID;
+ }
+
+ public:
+ BVH4::NodeRef troot;
+#if !defined(__X86_64__) && !defined(__aarch64__)
+ unsigned align1;
+#endif
+ unsigned time_steps;
+ unsigned width;
+
+ unsigned height;
+ unsigned dim_offset;
+ unsigned _geomID;
+ unsigned _primID;
+
+ unsigned align2;
+ unsigned gridOffset;
+ unsigned gridBytes;
+ unsigned rootOffset;
+
+ int8_t data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h
new file mode 100644
index 0000000000..2ed922a5ae
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h
@@ -0,0 +1,207 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ class GridSOAIntersector1
+ {
+ public:
+ typedef void Primitive;
+
+ class Precalculations
+ {
+ public:
+ __forceinline Precalculations (const Ray& ray, const void* ptr)
+ : grid(nullptr) {}
+
+ public:
+ GridSOA* grid;
+ int itime;
+ float ftime;
+ };
+
+ template<typename Loader>
+ static __forceinline void intersect(RayHit& ray,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+ Vec3<vfloat> v0, v1, v2;
+ Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+ GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+ PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+ intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ };
+
+ template<typename Loader>
+ static __forceinline bool occluded(Ray& ray,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ Vec3<vfloat> v0, v1, v2;
+ Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+
+ GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+ PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+ return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ }
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+ intersect<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+ intersect<GridSOA::Gather2x3>(ray, context, grid_x , line_offset, lines, pre);
+ if (likely(lines > 2))
+ intersect<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+ return occluded<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+ if (occluded<GridSOA::Gather2x3>(ray, context, grid_x , line_offset, lines, pre)) return true;
+ if (likely(lines > 2))
+ if (occluded<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+ return false;
+ }
+ };
+
+ class GridSOAMBIntersector1
+ {
+ public:
+ typedef void Primitive;
+ typedef GridSOAIntersector1::Precalculations Precalculations;
+
+ template<typename Loader>
+ static __forceinline void intersect(RayHit& ray, const float ftime,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const size_t grid_offset = pre.grid->gridBytes >> 2;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ Vec3<vfloat> a0, a1, a2;
+ Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+ Vec3<vfloat> b0, b1, b2;
+ Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+ Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+ Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+ Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+ GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+ PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+ intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ };
+
+ template<typename Loader>
+ static __forceinline bool occluded(Ray& ray, const float ftime,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const size_t grid_offset = pre.grid->gridBytes >> 2;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ Vec3<vfloat> a0, a1, a2;
+ Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+ Vec3<vfloat> b0, b1, b2;
+ Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+ Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+ Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+ Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+ GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+ PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+ return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ }
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim);
+
+#if defined(__AVX__)
+ intersect<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+ intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+ if (likely(lines > 2))
+ intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim);
+
+#if defined(__AVX__)
+ return occluded<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+ if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x , line_offset, lines, pre)) return true;
+ if (likely(lines > 2))
+ if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+ return false;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h
new file mode 100644
index 0000000000..41d66e1e28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h
@@ -0,0 +1,445 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int K>
+ struct MapUV0
+ {
+ const float* const grid_uv;
+ size_t ofs00, ofs01, ofs10, ofs11;
+
+ __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+ : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+
+ __forceinline void operator() (vfloat<K>& u, vfloat<K>& v) const {
+ const vfloat<K> uv00(grid_uv[ofs00]);
+ const vfloat<K> uv01(grid_uv[ofs01]);
+ const vfloat<K> uv10(grid_uv[ofs10]);
+ const vfloat<K> uv11(grid_uv[ofs11]);
+ const Vec2vf<K> uv0 = GridSOA::decodeUV(uv00);
+ const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+ const Vec2vf<K> uv2 = GridSOA::decodeUV(uv10);
+ const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0));
+ u = uv[0]; v = uv[1];
+ }
+ };
+
+ template<int K>
+ struct MapUV1
+ {
+ const float* const grid_uv;
+ size_t ofs00, ofs01, ofs10, ofs11;
+
+ __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+ : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+
+ __forceinline void operator() (vfloat<K>& u, vfloat<K>& v) const {
+ const vfloat<K> uv00(grid_uv[ofs00]);
+ const vfloat<K> uv01(grid_uv[ofs01]);
+ const vfloat<K> uv10(grid_uv[ofs10]);
+ const vfloat<K> uv11(grid_uv[ofs11]);
+ const Vec2vf<K> uv0 = GridSOA::decodeUV(uv10);
+ const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+ const Vec2vf<K> uv2 = GridSOA::decodeUV(uv11);
+ const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0));
+ u = uv[0]; v = uv[1];
+ }
+ };
+
+ template<int K>
+ class GridSOAIntersectorK
+ {
+ public:
+ typedef void Primitive;
+
+ class Precalculations
+ {
+#if defined(__AVX__)
+ static const int M = 8;
+#else
+ static const int M = 4;
+#endif
+
+ public:
+ __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray)
+ : grid(nullptr), intersector(valid,ray) {}
+
+ public:
+ GridSOA* grid;
+ PlueckerIntersectorK<M,K> intersector; // FIXME: use quad intersector
+ };
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t dim_offset = pre.grid->dim_offset;
+ const size_t line_offset = pre.grid->width;
+ const float* const grid_x = pre.grid->decodeLeaf(0,prim);
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ const size_t max_x = pre.grid->width == 2 ? 1 : 2;
+ const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+ for (size_t y=0; y<max_y; y++)
+ {
+ for (size_t x=0; x<max_x; x++)
+ {
+ const size_t ofs00 = (y+0)*line_offset+(x+0);
+ const size_t ofs01 = (y+0)*line_offset+(x+1);
+ const size_t ofs10 = (y+1)*line_offset+(x+0);
+ const size_t ofs11 = (y+1)*line_offset+(x+1);
+ const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+ const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+ const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+ const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+ pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ }
+ }
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t dim_offset = pre.grid->dim_offset;
+ const size_t line_offset = pre.grid->width;
+ const float* const grid_x = pre.grid->decodeLeaf(0,prim);
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ vbool<K> valid = valid_i;
+ const size_t max_x = pre.grid->width == 2 ? 1 : 2;
+ const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+ for (size_t y=0; y<max_y; y++)
+ {
+ for (size_t x=0; x<max_x; x++)
+ {
+ const size_t ofs00 = (y+0)*line_offset+(x+0);
+ const size_t ofs01 = (y+0)*line_offset+(x+1);
+ const size_t ofs10 = (y+1)*line_offset+(x+0);
+ const size_t ofs11 = (y+1)*line_offset+(x+1);
+ const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+ const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+ const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+ const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+ pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+ if (none(valid)) break;
+ pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+ if (none(valid)) break;
+ }
+ }
+ return !valid;
+ }
+
+ template<typename Loader>
+ static __forceinline void intersect(RayHitK<K>& ray, size_t k,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+ Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+ pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+ };
+
+ template<typename Loader>
+ static __forceinline bool occluded(RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+ Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+ return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+ }
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(0,prim);
+#if defined(__AVX__)
+ intersect<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+ intersect<GridSOA::Gather2x3>(ray, k, context, grid_x , line_offset, lines, pre);
+ if (likely(lines > 2))
+ intersect<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+ return occluded<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+ if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x , line_offset, lines, pre)) return true;
+ if (likely(lines > 2))
+ if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+ return false;
+ }
+ };
+
+ template<int K>
+ class GridSOAMBIntersectorK
+ {
+ public:
+ typedef void Primitive;
+ typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations;
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ vfloat<K> vftime;
+ vint<K> vitime = getTimeSegment(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+ vbool<K> valid1 = valid_i;
+ while (any(valid1)) {
+ const size_t j = bsf(movemask(valid1));
+ const int itime = vitime[j];
+ const vbool<K> valid2 = valid1 & (itime == vitime);
+ valid1 = valid1 & !valid2;
+ intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node);
+ }
+ }
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t grid_offset = pre.grid->gridBytes >> 2;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const size_t line_offset = pre.grid->width;
+ const float* const grid_x = pre.grid->decodeLeaf(itime,prim);
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ const size_t max_x = pre.grid->width == 2 ? 1 : 2;
+ const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+ for (size_t y=0; y<max_y; y++)
+ {
+ for (size_t x=0; x<max_x; x++)
+ {
+ size_t ofs00 = (y+0)*line_offset+(x+0);
+ size_t ofs01 = (y+0)*line_offset+(x+1);
+ size_t ofs10 = (y+1)*line_offset+(x+0);
+ size_t ofs11 = (y+1)*line_offset+(x+1);
+ const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+ const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+ const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+ const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+ ofs00 += grid_offset;
+ ofs01 += grid_offset;
+ ofs10 += grid_offset;
+ ofs11 += grid_offset;
+ const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+ const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+ const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+ const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+ const Vec3vf<K> p00 = lerp(a00,b00,ftime);
+ const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+ const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+ const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+ pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+ }
+ }
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ vfloat<K> vftime;
+ vint<K> vitime = getTimeSegment(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+ vbool<K> valid_o = valid_i;
+ vbool<K> valid1 = valid_i;
+ while (any(valid1)) {
+ const int j = int(bsf(movemask(valid1)));
+ const int itime = vitime[j];
+ const vbool<K> valid2 = valid1 & (itime == vitime);
+ valid1 = valid1 & !valid2;
+ valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node);
+ }
+ return !valid_o;
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ const size_t grid_offset = pre.grid->gridBytes >> 2;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const size_t line_offset = pre.grid->width;
+ const float* const grid_x = pre.grid->decodeLeaf(itime,prim);
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ vbool<K> valid = valid_i;
+ const size_t max_x = pre.grid->width == 2 ? 1 : 2;
+ const size_t max_y = pre.grid->height == 2 ? 1 : 2;
+ for (size_t y=0; y<max_y; y++)
+ {
+ for (size_t x=0; x<max_x; x++)
+ {
+ size_t ofs00 = (y+0)*line_offset+(x+0);
+ size_t ofs01 = (y+0)*line_offset+(x+1);
+ size_t ofs10 = (y+1)*line_offset+(x+0);
+ size_t ofs11 = (y+1)*line_offset+(x+1);
+ const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+ const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+ const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+ const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+ ofs00 += grid_offset;
+ ofs01 += grid_offset;
+ ofs10 += grid_offset;
+ ofs11 += grid_offset;
+ const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+ const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+ const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+ const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+ const Vec3vf<K> p00 = lerp(a00,b00,ftime);
+ const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+ const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+ const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+ pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+ if (none(valid)) break;
+ pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+ if (none(valid)) break;
+ }
+ }
+ return valid;
+ }
+
+ template<typename Loader>
+ static __forceinline void intersect(RayHitK<K>& ray, size_t k,
+ const float ftime,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t grid_offset = pre.grid->gridBytes >> 2;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ Vec3<vfloat> a0, a1, a2;
+ Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+ Vec3<vfloat> b0, b1, b2;
+ Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+ Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+ Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+ Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+ pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+ };
+
+ template<typename Loader>
+ static __forceinline bool occluded(RayK<K>& ray, size_t k,
+ const float ftime,
+ IntersectContext* context,
+ const float* const grid_x,
+ const size_t line_offset,
+ const size_t lines,
+ Precalculations& pre)
+ {
+ typedef typename Loader::vfloat vfloat;
+ const size_t grid_offset = pre.grid->gridBytes >> 2;
+ const size_t dim_offset = pre.grid->dim_offset;
+ const float* const grid_y = grid_x + 1 * dim_offset;
+ const float* const grid_z = grid_x + 2 * dim_offset;
+ const float* const grid_uv = grid_x + 3 * dim_offset;
+
+ Vec3<vfloat> a0, a1, a2;
+ Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+ Vec3<vfloat> b0, b1, b2;
+ Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+ Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+ Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+ Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+ return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+ }
+
+ /*! Intersect a ray with the primitive. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ float ftime;
+ int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+ intersect<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+ intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre);
+ if (likely(lines > 2))
+ intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ float ftime;
+ int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+ const size_t line_offset = pre.grid->width;
+ const size_t lines = pre.grid->height;
+ const float* const grid_x = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+ return occluded<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+ if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true;
+ if (likely(lines > 2))
+ if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+ return false;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance.h b/thirdparty/embree-aarch64/kernels/geometry/instance.h
new file mode 100644
index 0000000000..66893d581f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/instance.h
@@ -0,0 +1,78 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene_instance.h"
+
+namespace embree
+{
+ struct InstancePrimitive
+ {
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored primitives */
+ static __forceinline size_t max_size() { return 1; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return N; }
+
+ public:
+
+ InstancePrimitive (const Instance* instance, unsigned int instID)
+ : instance(instance)
+ , instID_(instID)
+ {}
+
+ __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+ {
+ assert(end-i == 1);
+ const PrimRef& prim = prims[i]; i++;
+ const unsigned int geomID = prim.geomID();
+ const Instance* instance = scene->get<Instance>(geomID);
+ new (this) InstancePrimitive(instance, geomID);
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+ {
+ assert(end-i == 1);
+ const PrimRef& prim = prims[i]; i++;
+ const unsigned int geomID = prim.geomID();
+ const Instance* instance = scene->get<Instance>(geomID);
+ new (this) InstancePrimitive(instance,geomID);
+ return instance->linearBounds(0,itime);
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ assert(end-i == 1);
+ const PrimRefMB& prim = prims[i]; i++;
+ const unsigned int geomID = prim.geomID();
+ const Instance* instance = scene->get<Instance>(geomID);
+ new (this) InstancePrimitive(instance,geomID);
+ return instance->linearBounds(0,time_range);
+ }
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(Instance* instance) {
+ return instance->bounds(0);
+ }
+
+ public:
+ const Instance* instance;
+ const unsigned int instID_ = std::numeric_limits<unsigned int>::max ();
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h
new file mode 100644
index 0000000000..91731a39c5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "instance.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct InstanceIntersector1
+ {
+ typedef InstancePrimitive Primitive;
+
+ struct Precalculations {
+ __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+ };
+
+ static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+ static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+ static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+ };
+
+ struct InstanceIntersector1MB
+ {
+ typedef InstancePrimitive Primitive;
+
+ struct Precalculations {
+ __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+ };
+
+ static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+ static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+ static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+ };
+
+ template<int K>
+ struct InstanceIntersectorK
+ {
+ typedef InstancePrimitive Primitive;
+
+ struct Precalculations {
+ __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+ };
+
+ static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+ static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+ intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+ occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+ return ray.tfar[k] < 0.0f;
+ }
+ };
+
+ template<int K>
+ struct InstanceIntersectorKMB
+ {
+ typedef InstancePrimitive Primitive;
+
+ struct Precalculations {
+ __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+ };
+
+ static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+ static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+ intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+ occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+ return ray.tfar[k] < 0.0f;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h b/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h
new file mode 100644
index 0000000000..0df49dd6e9
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h
@@ -0,0 +1,1074 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/context.h"
+#include "filter.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct UVIdentity {
+ __forceinline void operator() (vfloat<M>& u, vfloat<M>& v) const {}
+ };
+
+
+ template<bool filter>
+ struct Intersect1Epilog1
+ {
+ RayHit& ray;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Intersect1Epilog1(RayHit& ray,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (Hit& hit) const
+ {
+ /* ray mask test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+ hit.finalize();
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t;
+ bool found = runIntersectionFilter1(geometry,ray,context,h);
+ if (!found) ray.tfar = old_t;
+ return found;
+ }
+ }
+#endif
+
+ /* update hit information */
+ ray.tfar = hit.t;
+ ray.Ng = hit.Ng;
+ ray.u = hit.u;
+ ray.v = hit.v;
+ ray.primID = primID;
+ ray.geomID = geomID;
+ instance_id_stack::copy(context->user->instID, ray.instID);
+ return true;
+ }
+ };
+
+ template<bool filter>
+ struct Occluded1Epilog1
+ {
+ Ray& ray;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Occluded1Epilog1(Ray& ray,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (Hit& hit) const
+ {
+ /* ray mask test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+
+#if defined(EMBREE_RAY_MASK)
+ if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+ hit.finalize();
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+ HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t;
+ const bool found = runOcclusionFilter1(geometry,ray,context,h);
+ if (!found) ray.tfar = old_t;
+ return found;
+ }
+ }
+#endif
+ return true;
+ }
+ };
+
+ template<int K, bool filter>
+ struct Intersect1KEpilog1
+ {
+ RayHitK<K>& ray;
+ size_t k;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (Hit& hit) const
+ {
+ /* ray mask test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ if ((geometry->mask & ray.mask[k]) == 0)
+ return false;
+#endif
+ hit.finalize();
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+ const float old_t = ray.tfar[k];
+ ray.tfar[k] = hit.t;
+ const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+ if (!found) ray.tfar[k] = old_t;
+ return found;
+ }
+ }
+#endif
+
+ /* update hit information */
+ ray.tfar[k] = hit.t;
+ ray.Ng.x[k] = hit.Ng.x;
+ ray.Ng.y[k] = hit.Ng.y;
+ ray.Ng.z[k] = hit.Ng.z;
+ ray.u[k] = hit.u;
+ ray.v[k] = hit.v;
+ ray.primID[k] = primID;
+ ray.geomID[k] = geomID;
+ instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+ return true;
+ }
+ };
+
+ template<int K, bool filter>
+ struct Occluded1KEpilog1
+ {
+ RayK<K>& ray;
+ size_t k;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (Hit& hit) const
+ {
+ /* ray mask test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ if ((geometry->mask & ray.mask[k]) == 0)
+ return false;
+#endif
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+ hit.finalize();
+ HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+ const float old_t = ray.tfar[k];
+ ray.tfar[k] = hit.t;
+ const bool found = any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+ if (!found) ray.tfar[k] = old_t;
+ return found;
+ }
+ }
+#endif
+ return true;
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct Intersect1EpilogM
+ {
+ RayHit& ray;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+
+ __forceinline Intersect1EpilogM(RayHit& ray,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+ : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+ vbool<Mx> valid = valid_i;
+ if (Mx > M) valid &= (1<<M)-1;
+ hit.finalize();
+ size_t i = select_min(valid,hit.vt);
+ unsigned int geomID = geomIDs[i];
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+ bool foundhit = false;
+ goto entry;
+ while (true)
+ {
+ if (unlikely(none(valid))) return foundhit;
+ i = select_min(valid,hit.vt);
+
+ geomID = geomIDs[i];
+ entry:
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+ /* goto next hit if mask test fails */
+ if ((geometry->mask & ray.mask) == 0) {
+ clear(valid,i);
+ continue;
+ }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+ /* call intersection filter function */
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ const Vec2f uv = hit.uv(i);
+ HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t(i);
+ const bool found = runIntersectionFilter1(geometry,ray,context,h);
+ if (!found) ray.tfar = old_t;
+ foundhit |= found;
+ clear(valid,i);
+ valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+ continue;
+ }
+ }
+#endif
+ break;
+ }
+#endif
+
+ /* update hit information */
+ const Vec2f uv = hit.uv(i);
+ ray.tfar = hit.vt[i];
+ ray.Ng.x = hit.vNg.x[i];
+ ray.Ng.y = hit.vNg.y[i];
+ ray.Ng.z = hit.vNg.z[i];
+ ray.u = uv.x;
+ ray.v = uv.y;
+ ray.primID = primIDs[i];
+ ray.geomID = geomID;
+ instance_id_stack::copy(context->user->instID, ray.instID);
+ return true;
+
+ }
+ };
+
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+ template<int M, bool filter>
+ struct Intersect1EpilogM<M,16,filter>
+ {
+ static const size_t Mx = 16;
+ RayHit& ray;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+
+ __forceinline Intersect1EpilogM(RayHit& ray,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+ : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+ {
+ Scene* MAYBE_UNUSED scene = context->scene;
+ vbool<Mx> valid = valid_i;
+ if (Mx > M) valid &= (1<<M)-1;
+ hit.finalize();
+ size_t i = select_min(valid,hit.vt);
+ unsigned int geomID = geomIDs[i];
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+ bool foundhit = false;
+ goto entry;
+ while (true)
+ {
+ if (unlikely(none(valid))) return foundhit;
+ i = select_min(valid,hit.vt);
+
+ geomID = geomIDs[i];
+ entry:
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+ /* goto next hit if mask test fails */
+ if ((geometry->mask & ray.mask) == 0) {
+ clear(valid,i);
+ continue;
+ }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+ /* call intersection filter function */
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ const Vec2f uv = hit.uv(i);
+ HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t(i);
+ const bool found = runIntersectionFilter1(geometry,ray,context,h);
+ if (!found) ray.tfar = old_t;
+ foundhit |= found;
+ clear(valid,i);
+ valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+ continue;
+ }
+ }
+#endif
+ break;
+ }
+#endif
+
+ vbool<Mx> finalMask(((unsigned int)1 << i));
+ ray.update(finalMask,hit.vt,hit.vu,hit.vv,hit.vNg.x,hit.vNg.y,hit.vNg.z,geomID,primIDs);
+ instance_id_stack::foreach([&](unsigned level)
+ {
+ ray.instID[level] = context->user->instID[level];
+ return (context->user->instID[level] != RTC_INVALID_GEOMETRY_ID);
+ });
+ return true;
+
+ }
+ };
+#endif
+
+ template<int M, int Mx, bool filter>
+ struct Occluded1EpilogM
+ {
+ Ray& ray;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+
+ __forceinline Occluded1EpilogM(Ray& ray,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+ : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+ if (unlikely(filter))
+ hit.finalize(); /* called only once */
+
+ vbool<Mx> valid = valid_i;
+ if (Mx > M) valid &= (1<<M)-1;
+ size_t m=movemask(valid);
+ goto entry;
+ while (true)
+ {
+ if (unlikely(m == 0)) return false;
+ entry:
+ size_t i=bsf(m);
+
+ const unsigned int geomID = geomIDs[i];
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+ /* goto next hit if mask test fails */
+ if ((geometry->mask & ray.mask) == 0) {
+ m=btc(m,i);
+ continue;
+ }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+ /* if we have no filter then the test passed */
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+ {
+ const Vec2f uv = hit.uv(i);
+ HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t(i);
+ if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+ ray.tfar = old_t;
+ m=btc(m,i);
+ continue;
+ }
+ }
+#endif
+ break;
+ }
+#endif
+
+ return true;
+ }
+ };
+
+ template<int M, bool filter>
+ struct Intersect1EpilogMU
+ {
+ RayHit& ray;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Intersect1EpilogMU(RayHit& ray,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+ {
+ /* ray mask test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+ vbool<M> valid = valid_i;
+ hit.finalize();
+
+ size_t i = select_min(valid,hit.vt);
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+ {
+ bool foundhit = false;
+ while (true)
+ {
+ /* call intersection filter function */
+ Vec2f uv = hit.uv(i);
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t(i);
+ HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+ const bool found = runIntersectionFilter1(geometry,ray,context,h);
+ if (!found) ray.tfar = old_t;
+ foundhit |= found;
+ clear(valid,i);
+ valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+ if (unlikely(none(valid))) break;
+ i = select_min(valid,hit.vt);
+ }
+ return foundhit;
+ }
+#endif
+
+ /* update hit information */
+ const Vec2f uv = hit.uv(i);
+ const Vec3fa Ng = hit.Ng(i);
+ ray.tfar = hit.t(i);
+ ray.Ng.x = Ng.x;
+ ray.Ng.y = Ng.y;
+ ray.Ng.z = Ng.z;
+ ray.u = uv.x;
+ ray.v = uv.y;
+ ray.primID = primID;
+ ray.geomID = geomID;
+ instance_id_stack::copy(context->user->instID, ray.instID);
+ return true;
+ }
+ };
+
+ template<int M, bool filter>
+ struct Occluded1EpilogMU
+ {
+ Ray& ray;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Occluded1EpilogMU(Ray& ray,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<M>& valid, Hit& hit) const
+ {
+ /* ray mask test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+ {
+ hit.finalize();
+ for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m))
+ {
+ const Vec2f uv = hit.uv(i);
+ const float old_t = ray.tfar;
+ ray.tfar = hit.t(i);
+ HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+ if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+ ray.tfar = old_t;
+ }
+ return false;
+ }
+#endif
+ return true;
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct IntersectKEpilogM
+ {
+ RayHitK<K>& ray;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+ const size_t i;
+
+ __forceinline IntersectKEpilogM(RayHitK<K>& ray,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs,
+ size_t i)
+ : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+
+ template<typename Hit>
+ __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+
+ vfloat<K> u, v, t;
+ Vec3vf<K> Ng;
+ vbool<K> valid = valid_i;
+
+ std::tie(u,v,t,Ng) = hit();
+
+ const unsigned int geomID = geomIDs[i];
+ const unsigned int primID = primIDs[i];
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+ /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+ valid &= (geometry->mask & ray.mask) != 0;
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ HitK<K> h(context->user,geomID,primID,u,v,Ng);
+ const vfloat<K> old_t = ray.tfar;
+ ray.tfar = select(valid,t,ray.tfar);
+ const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+ ray.tfar = select(m_accept,ray.tfar,old_t);
+ return m_accept;
+ }
+ }
+#endif
+
+ /* update hit information */
+ vfloat<K>::store(valid,&ray.tfar,t);
+ vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+ vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+ vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+ vfloat<K>::store(valid,&ray.u,u);
+ vfloat<K>::store(valid,&ray.v,v);
+ vuint<K>::store(valid,&ray.primID,primID);
+ vuint<K>::store(valid,&ray.geomID,geomID);
+ instance_id_stack::copy<const unsigned*, vuint<K>*, const vbool<K>&>(context->user->instID, ray.instID, valid);
+ return valid;
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct OccludedKEpilogM
+ {
+ vbool<K>& valid0;
+ RayK<K>& ray;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+ const size_t i;
+
+ __forceinline OccludedKEpilogM(vbool<K>& valid0,
+ RayK<K>& ray,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs,
+ size_t i)
+ : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+
+ template<typename Hit>
+ __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+ {
+ vbool<K> valid = valid_i;
+
+ /* ray masking test */
+ Scene* scene MAYBE_UNUSED = context->scene;
+ const unsigned int geomID = geomIDs[i];
+ const unsigned int primID = primIDs[i];
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ valid &= (geometry->mask & ray.mask) != 0;
+ if (unlikely(none(valid))) return valid;
+#endif
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+ {
+ vfloat<K> u, v, t;
+ Vec3vf<K> Ng;
+ std::tie(u,v,t,Ng) = hit();
+ HitK<K> h(context->user,geomID,primID,u,v,Ng);
+ const vfloat<K> old_t = ray.tfar;
+ ray.tfar = select(valid,t,ray.tfar);
+ valid = runOcclusionFilter(valid,geometry,ray,context,h);
+ ray.tfar = select(valid,ray.tfar,old_t);
+ }
+ }
+#endif
+
+ /* update occlusion */
+ valid0 = valid0 & !valid;
+ return valid;
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct IntersectKEpilogMU
+ {
+ RayHitK<K>& ray;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline IntersectKEpilogMU(RayHitK<K>& ray,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline vbool<K> operator() (const vbool<K>& valid_org, const Hit& hit) const
+ {
+ vbool<K> valid = valid_org;
+ vfloat<K> u, v, t;
+ Vec3vf<K> Ng;
+ std::tie(u,v,t,Ng) = hit();
+
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+ /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+ valid &= (geometry->mask & ray.mask) != 0;
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ HitK<K> h(context->user,geomID,primID,u,v,Ng);
+ const vfloat<K> old_t = ray.tfar;
+ ray.tfar = select(valid,t,ray.tfar);
+ const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+ ray.tfar = select(m_accept,ray.tfar,old_t);
+ return m_accept;
+ }
+ }
+#endif
+
+ /* update hit information */
+ vfloat<K>::store(valid,&ray.tfar,t);
+ vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+ vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+ vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+ vfloat<K>::store(valid,&ray.u,u);
+ vfloat<K>::store(valid,&ray.v,v);
+ vuint<K>::store(valid,&ray.primID,primID);
+ vuint<K>::store(valid,&ray.geomID,geomID);
+ instance_id_stack::copy<const unsigned*, vuint<K>*, const vbool<K>&>(context->user->instID, ray.instID, valid);
+
+ return valid;
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct OccludedKEpilogMU
+ {
+ vbool<K>& valid0;
+ RayK<K>& ray;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline OccludedKEpilogMU(vbool<K>& valid0,
+ RayK<K>& ray,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+ {
+ vbool<K> valid = valid_i;
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+ valid &= (geometry->mask & ray.mask) != 0;
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+ {
+ vfloat<K> u, v, t;
+ Vec3vf<K> Ng;
+ std::tie(u,v,t,Ng) = hit();
+ HitK<K> h(context->user,geomID,primID,u,v,Ng);
+ const vfloat<K> old_t = ray.tfar;
+ ray.tfar = select(valid,t,ray.tfar);
+ valid = runOcclusionFilter(valid,geometry,ray,context,h);
+ ray.tfar = select(valid,ray.tfar,old_t);
+ }
+ }
+#endif
+
+ /* update occlusion */
+ valid0 = valid0 & !valid;
+ return valid;
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct Intersect1KEpilogM
+ {
+ RayHitK<K>& ray;
+ size_t k;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+
+ __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+ : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+ vbool<Mx> valid = valid_i;
+ hit.finalize();
+ if (Mx > M) valid &= (1<<M)-1;
+ size_t i = select_min(valid,hit.vt);
+ assert(i<M);
+ unsigned int geomID = geomIDs[i];
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+ bool foundhit = false;
+ goto entry;
+ while (true)
+ {
+ if (unlikely(none(valid))) return foundhit;
+ i = select_min(valid,hit.vt);
+ assert(i<M);
+ geomID = geomIDs[i];
+ entry:
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+ /* goto next hit if mask test fails */
+ if ((geometry->mask & ray.mask[k]) == 0) {
+ clear(valid,i);
+ continue;
+ }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+ /* call intersection filter function */
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+ assert(i<M);
+ const Vec2f uv = hit.uv(i);
+ HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+ const float old_t = ray.tfar[k];
+ ray.tfar[k] = hit.t(i);
+ const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+ if (!found) ray.tfar[k] = old_t;
+ foundhit = foundhit | found;
+ clear(valid,i);
+ valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+ continue;
+ }
+ }
+#endif
+ break;
+ }
+#endif
+ assert(i<M);
+ /* update hit information */
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+ ray.updateK(i,k,hit.vt,hit.vu,hit.vv,vfloat<Mx>(hit.vNg.x),vfloat<Mx>(hit.vNg.y),vfloat<Mx>(hit.vNg.z),geomID,vuint<Mx>(primIDs));
+#else
+ const Vec2f uv = hit.uv(i);
+ ray.tfar[k] = hit.t(i);
+ ray.Ng.x[k] = hit.vNg.x[i];
+ ray.Ng.y[k] = hit.vNg.y[i];
+ ray.Ng.z[k] = hit.vNg.z[i];
+ ray.u[k] = uv.x;
+ ray.v[k] = uv.y;
+ ray.primID[k] = primIDs[i];
+ ray.geomID[k] = geomID;
+ instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+#endif
+ return true;
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct Occluded1KEpilogM
+ {
+ RayK<K>& ray;
+ size_t k;
+ IntersectContext* context;
+ const vuint<M>& geomIDs;
+ const vuint<M>& primIDs;
+
+ __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+ : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+ if (unlikely(filter))
+ hit.finalize(); /* called only once */
+
+ vbool<Mx> valid = valid_i;
+ if (Mx > M) valid &= (1<<M)-1;
+ size_t m=movemask(valid);
+ goto entry;
+ while (true)
+ {
+ if (unlikely(m == 0)) return false;
+ entry:
+ size_t i=bsf(m);
+
+ const unsigned int geomID = geomIDs[i];
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+ /* goto next hit if mask test fails */
+ if ((geometry->mask & ray.mask[k]) == 0) {
+ m=btc(m,i);
+ continue;
+ }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+ /* execute occlusion filer */
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+ {
+ const Vec2f uv = hit.uv(i);
+ const float old_t = ray.tfar[k];
+ ray.tfar[k] = hit.t(i);
+ HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+ if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true;
+ ray.tfar[k] = old_t;
+ m=btc(m,i);
+ continue;
+ }
+ }
+#endif
+ break;
+ }
+#endif
+ return true;
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct Intersect1KEpilogMU
+ {
+ RayHitK<K>& ray;
+ size_t k;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ /* ray mask test */
+ if ((geometry->mask & ray.mask[k]) == 0)
+ return false;
+#endif
+
+ /* finalize hit calculation */
+ vbool<M> valid = valid_i;
+ hit.finalize();
+ size_t i = select_min(valid,hit.vt);
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+ {
+ bool foundhit = false;
+ while (true)
+ {
+ const Vec2f uv = hit.uv(i);
+ const float old_t = ray.tfar[k];
+ ray.tfar[k] = hit.t(i);
+ HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+ const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+ if (!found) ray.tfar[k] = old_t;
+ foundhit = foundhit | found;
+ clear(valid,i);
+ valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+ if (unlikely(none(valid))) break;
+ i = select_min(valid,hit.vt);
+ }
+ return foundhit;
+ }
+ }
+#endif
+
+ /* update hit information */
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+ const Vec3fa Ng = hit.Ng(i);
+ ray.updateK(i,k,hit.vt,hit.vu,hit.vv,vfloat<M>(Ng.x),vfloat<M>(Ng.y),vfloat<M>(Ng.z),geomID,vuint<M>(primID));
+#else
+ const Vec2f uv = hit.uv(i);
+ const Vec3fa Ng = hit.Ng(i);
+ ray.tfar[k] = hit.t(i);
+ ray.Ng.x[k] = Ng.x;
+ ray.Ng.y[k] = Ng.y;
+ ray.Ng.z[k] = Ng.z;
+ ray.u[k] = uv.x;
+ ray.v[k] = uv.y;
+ ray.primID[k] = primID;
+ ray.geomID[k] = geomID;
+ instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+#endif
+ return true;
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct Occluded1KEpilogMU
+ {
+ RayK<K>& ray;
+ size_t k;
+ IntersectContext* context;
+ const unsigned int geomID;
+ const unsigned int primID;
+
+ __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+ template<typename Hit>
+ __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+ {
+ Scene* scene MAYBE_UNUSED = context->scene;
+ Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+ /* ray mask test */
+ if ((geometry->mask & ray.mask[k]) == 0)
+ return false;
+#endif
+
+ /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+ if (filter) {
+ if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+ {
+ hit.finalize();
+ for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m))
+ {
+ const Vec2f uv = hit.uv(i);
+ const float old_t = ray.tfar[k];
+ ray.tfar[k] = hit.t(i);
+ HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+ if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true;
+ ray.tfar[k] = old_t;
+ }
+ return false;
+ }
+ }
+#endif
+ return true;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h b/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h
new file mode 100644
index 0000000000..5c1ba5cb61
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h
@@ -0,0 +1,172 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename Intersector>
+ struct ArrayIntersector1
+ {
+ typedef typename Intersector::Primitive Primitive;
+ typedef typename Intersector::Precalculations Precalculations;
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ for (size_t i=0; i<num; i++)
+ Intersector::intersect(pre,ray,context,prim[i]);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ for (size_t i=0; i<num; i++) {
+ if (Intersector::occluded(pre,ray,context,prim[i]))
+ return true;
+ }
+ return false;
+ }
+
+ template<int N>
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+ {
+ bool changed = false;
+ for (size_t i=0; i<num; i++)
+ changed |= Intersector::pointQuery(query, context, prim[i]);
+ return changed;
+ }
+
+ template<int K>
+ static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ }
+
+ template<int K>
+ static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ return valid;
+ }
+ };
+
+ template<int K, typename Intersector>
+ struct ArrayIntersectorK_1
+ {
+ typedef typename Intersector::Primitive Primitive;
+ typedef typename Intersector::Precalculations Precalculations;
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ for (size_t i=0; i<num; i++) {
+ Intersector::intersect(valid,pre,ray,context,prim[i]);
+ }
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ vbool<K> valid0 = valid;
+ for (size_t i=0; i<num; i++) {
+ valid0 &= !Intersector::occluded(valid0,pre,ray,context,prim[i]);
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ for (size_t i=0; i<num; i++) {
+ Intersector::intersect(pre,ray,k,context,prim[i]);
+ }
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ for (size_t i=0; i<num; i++) {
+ if (Intersector::occluded(pre,ray,k,context,prim[i]))
+ return true;
+ }
+ return false;
+ }
+ };
+
+ // =============================================================================================
+
+ template<int K, typename IntersectorK>
+ struct ArrayIntersectorKStream
+ {
+ typedef typename IntersectorK::Primitive PrimitiveK;
+ typedef typename IntersectorK::Precalculations PrecalculationsK;
+
+ static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+ {
+ PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+
+ for (size_t i=0; i<num; i++) {
+ IntersectorK::intersect(valid,pre,ray,context,prim[i]);
+ }
+ }
+
+ static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+ {
+ PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+ vbool<K> valid0 = valid;
+ for (size_t i=0; i<num; i++) {
+ valid0 &= !IntersectorK::occluded(valid0,pre,ray,context,prim[i]);
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+ {
+ PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+ for (size_t i=0; i<num; i++) {
+ IntersectorK::intersect(pre,ray,k,context,prim[i]);
+ }
+ }
+
+ static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+ {
+ PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+ for (size_t i=0; i<num; i++) {
+ if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+ return true;
+ }
+ return false;
+ }
+
+ static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+ {
+ size_t m_occluded = 0;
+ for (size_t i=0; i<num; i++) {
+ size_t bits = cur_mask & (~m_occluded);
+ for (; bits!=0; )
+ {
+ const size_t rayID = bscf(bits);
+ RayHitK<K> &ray = *inputPackets[rayID / K];
+ const size_t k = rayID % K;
+ PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+ if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+ {
+ m_occluded |= (size_t)1 << rayID;
+ ray.tfar[k] = neg_inf;
+ }
+ }
+ }
+ return m_occluded;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h
new file mode 100644
index 0000000000..eef5b0b1fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct LineIntersectorHitM
+ {
+ __forceinline LineIntersectorHitM() {}
+
+ __forceinline LineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+ : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+ __forceinline void finalize() {}
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ public:
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct FlatLinearCurveIntersector1
+ {
+ typedef CurvePrecalculations1 Precalculations;
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ Ray& ray,
+ IntersectContext* context,
+ const LineSegments* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+ const Epilog& epilog)
+ {
+ /* transform end points into ray space */
+ vbool<M> valid = valid_i;
+ vfloat<M> depth_scale = pre.depth_scale;
+ LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space;
+
+ const Vec3vf<M> ray_org ((Vec3fa)ray.org);
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+
+ Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w);
+ Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+
+ /* approximative intersection with cone */
+ const Vec4vf<M> v = p1-p0;
+ const Vec4vf<M> w = -p0;
+ const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y);
+ const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y);
+ const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one));
+ const Vec4vf<M> p = madd(u,v,p0);
+ const vfloat<M> t = p.z;
+ const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y);
+ const vfloat<M> r = p.w;
+ const vfloat<M> r2 = r*r;
+ valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+ valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+ if (unlikely(none(valid))) return false;
+
+ /* ignore denormalized segments */
+ const Vec3vf<M> T = v1.xyz()-v0.xyz();
+ valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ LineIntersectorHitM<M> hit(u,zero,t,T);
+ return epilog(valid,hit);
+ }
+ };
+
+ template<int M, int K>
+ struct FlatLinearCurveIntersectorK
+ {
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const LineSegments* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+ const Epilog& epilog)
+ {
+ /* transform end points into ray space */
+ vbool<M> valid = valid_i;
+ vfloat<M> depth_scale = pre.depth_scale[k];
+ LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space[k];
+ const Vec3vf<M> ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+ const Vec3vf<M> ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+
+ Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w);
+ Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+
+ /* approximative intersection with cone */
+ const Vec4vf<M> v = p1-p0;
+ const Vec4vf<M> w = -p0;
+ const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y);
+ const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y);
+ const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one));
+ const Vec4vf<M> p = madd(u,v,p0);
+ const vfloat<M> t = p.z;
+ const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y);
+ const vfloat<M> r = p.w;
+ const vfloat<M> r2 = r*r;
+ valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+ if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+ valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+ if (unlikely(none(valid))) return false;
+
+ /* ignore denormalized segments */
+ const Vec3vf<M> T = v1.xyz()-v0.xyz();
+ valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ LineIntersectorHitM<M> hit(u,zero,t,T);
+ return epilog(valid,hit);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei.h b/thirdparty/embree-aarch64/kernels/geometry/linei.h
new file mode 100644
index 0000000000..a72029ca53
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/linei.h
@@ -0,0 +1,709 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ template<int M>
+ struct LineMi
+ {
+ /* Virtual interface to query information about the line segment type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored line segments */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N line segments */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ /* Returns required number of bytes for N line segments */
+ static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline LineMi() { }
+
+ /* Construction from vertices and IDs */
+ __forceinline LineMi(const vuint<M>& v0, unsigned short leftExists, unsigned short rightExists, const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype)
+ : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint<M>(primIDs) != vuint<M>(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs)
+ {
+ assert(all(vuint<M>(geomID()) == geomIDs));
+ }
+
+ /* Returns a mask that tells which line segments are valid */
+ __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+ /* Returns a mask that tells which line segments are valid */
+ template<int Mx>
+ __forceinline vbool<Mx> valid() const { return vuint<Mx>(primIDs) != vuint<Mx>(-1); }
+
+ /* Returns if the specified line segment is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+ /* Returns the number of stored line segments */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ //template<class T>
+ //static __forceinline T unmask(T &index) { return index & 0x3fffffff; }
+
+ __forceinline unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; }
+ //__forceinline vuint<M> geomID() { return unmask(geomIDs); }
+ //__forceinline const vuint<M> geomID() const { return unmask(geomIDs); }
+ //__forceinline unsigned int geomID(const size_t i) const { assert(i<M); return unmask(geomIDs[i]); }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M>& primID() { return primIDs; }
+ __forceinline const vuint<M>& primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* gather the line segments */
+ __forceinline void gather(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ const LineSegments* geom) const;
+
+ __forceinline void gatheri(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ const LineSegments* geom,
+ const int itime) const;
+
+ __forceinline void gather(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ const LineSegments* geom,
+ float time) const;
+
+ /* gather the line segments with lateral info */
+ __forceinline void gather(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ Vec4vf<M>& pL,
+ Vec4vf<M>& pR,
+ const LineSegments* geom) const;
+
+ __forceinline void gatheri(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ Vec4vf<M>& pL,
+ Vec4vf<M>& pR,
+ const LineSegments* geom,
+ const int itime) const;
+
+ __forceinline void gather(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ Vec4vf<M>& pL,
+ Vec4vf<M>& pR,
+ const LineSegments* geom,
+ float time) const;
+
+ __forceinline void gather(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ vbool<M>& cL,
+ vbool<M>& cR,
+ const LineSegments* geom) const;
+
+ __forceinline void gatheri(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ vbool<M>& cL,
+ vbool<M>& cR,
+ const LineSegments* geom,
+ const int itime) const;
+
+ __forceinline void gather(Vec4vf<M>& p0,
+ Vec4vf<M>& p1,
+ vbool<M>& cL,
+ vbool<M>& cR,
+ const LineSegments* geom,
+ float time) const;
+
+ /* Calculate the bounds of the line segments */
+ __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+ {
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+ const Vec3ff& p0 = geom->vertex(v0[i]+0,itime);
+ const Vec3ff& p1 = geom->vertex(v0[i]+1,itime);
+ BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+ b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+ bounds.extend(b);
+ }
+ return bounds;
+ }
+
+ /* Calculate the linear bounds of the primitive */
+ __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+ return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1));
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) {
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+ allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+ }
+ return allBounds;
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const LineSegments* geom = scene->get<LineSegments>(geomID((unsigned int)i));
+ allBounds.extend(geom->linearBounds(primID(i), time_range));
+ }
+ return allBounds;
+ }
+
+ /* Fill line segment from line segment list */
+ template<typename PrimRefT>
+ __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ Geometry::GType gty = scene->get(prims[begin].geomID())->getType();
+ vuint<M> geomID, primID;
+ vuint<M> v0;
+ unsigned short leftExists = 0;
+ unsigned short rightExists = 0;
+ const PrimRefT* prim = &prims[begin];
+
+ for (size_t i=0; i<M; i++)
+ {
+ const LineSegments* geom = scene->get<LineSegments>(prim->geomID());
+ if (begin<end) {
+ geomID[i] = prim->geomID();
+ primID[i] = prim->primID();
+ v0[i] = geom->segment(prim->primID());
+ leftExists |= geom->segmentLeftExists(primID[i]) << i;
+ rightExists |= geom->segmentRightExists(primID[i]) << i;
+ begin++;
+ } else {
+ assert(i);
+ if (i>0) {
+ geomID[i] = geomID[i-1];
+ primID[i] = -1;
+ v0[i] = v0[i-1];
+ }
+ }
+ if (begin<end) prim = &prims[begin]; // FIXME: remove this line
+ }
+ new (this) LineMi(v0,leftExists,rightExists,geomID,primID,gty); // FIXME: use non temporal store
+ }
+
+ template<typename BVH, typename Allocator>
+ __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+ {
+ size_t start = set.begin();
+ size_t items = LineMi::blocks(set.size());
+ size_t numbytes = LineMi::bytes(set.size());
+ LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+ for (size_t i=0; i<items; i++) {
+ accel[i].fill(prims,start,set.end(),bvh->scene);
+ }
+ return bvh->encodeLeaf((char*)accel,items);
+ };
+
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+ {
+ fill(prims,begin,end,scene);
+ return linearBounds(scene,itime);
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ fill(prims,begin,end,scene);
+ return linearBounds(scene,time_range);
+ }
+
+ template<typename BVH, typename SetMB, typename Allocator>
+ __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+ {
+ size_t start = prims.begin();
+ size_t end = prims.end();
+ size_t items = LineMi::blocks(prims.size());
+ size_t numbytes = LineMi::bytes(prims.size());
+ LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+ const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items);
+
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<items; i++)
+ bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+
+ return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+ };
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(LineSegments* geom)
+ {
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const Vec3ff& p0 = geom->vertex(v0[i]+0);
+ const Vec3ff& p1 = geom->vertex(v0[i]+1);
+ BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+ b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+ bounds.extend(b);
+ }
+ return bounds;
+ }
+
+ /*! output operator */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) {
+ return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}";
+ }
+
+ public:
+ unsigned char gtype;
+ unsigned char m;
+ unsigned int sharedGeomID;
+ unsigned short leftExists, rightExists;
+ vuint<M> v0; // index of start vertex
+ private:
+ vuint<M> primIDs; // primitive ID
+ };
+
+ template<>
+ __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+ Vec4vf4& p1,
+ const LineSegments* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+ transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+ transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+ Vec4vf4& p1,
+ const LineSegments* geom,
+ const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+ transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+ transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+ Vec4vf4& p1,
+ const LineSegments* geom,
+ float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf4 a0,a1;
+ gatheri(a0,a1,geom,itime);
+ Vec4vf4 b0,b1;
+ gatheri(b0,b1,geom,itime+1);
+ p0 = lerp(a0,b0,vfloat4(ftime));
+ p1 = lerp(a1,b1,vfloat4(ftime));
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+ Vec4vf4& p1,
+ vbool4& cL,
+ vbool4& cR,
+ const LineSegments* geom) const
+ {
+ gather(p0,p1,geom);
+ cL = !vbool4(leftExists);
+ cR = !vbool4(rightExists);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+ Vec4vf4& p1,
+ vbool4& cL,
+ vbool4& cR,
+ const LineSegments* geom,
+ const int itime) const
+ {
+ gatheri(p0,p1,geom,itime);
+ cL = !vbool4(leftExists);
+ cR = !vbool4(rightExists);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+ Vec4vf4& p1,
+ vbool4& cL,
+ vbool4& cR,
+ const LineSegments* geom,
+ float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf4 a0,a1;
+ gatheri(a0,a1,geom,itime);
+ Vec4vf4 b0,b1;
+ gatheri(b0,b1,geom,itime+1);
+ p0 = lerp(a0,b0,vfloat4(ftime));
+ p1 = lerp(a1,b1,vfloat4(ftime));
+ cL = !vbool4(leftExists);
+ cR = !vbool4(rightExists);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+ Vec4vf4& p1,
+ Vec4vf4& pL,
+ Vec4vf4& pR,
+ const LineSegments* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+ transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+ transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+
+ const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf);
+ const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+ const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+ const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+ transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w);
+
+ const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf);
+ const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+ const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+ const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+ transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gatheri(Vec4vf4& p0,
+ Vec4vf4& p1,
+ Vec4vf4& pL,
+ Vec4vf4& pR,
+ const LineSegments* geom,
+ const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+ transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+ transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w);
+
+ const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf);
+ const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+ const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+ const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+ transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w);
+
+ const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf);
+ const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+ const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+ const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+ transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w);
+ }
+
+ template<>
+ __forceinline void LineMi<4>::gather(Vec4vf4& p0,
+ Vec4vf4& p1,
+ Vec4vf4& pL,
+ Vec4vf4& pR,
+ const LineSegments* geom,
+ float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf4 a0,a1,aL,aR;
+ gatheri(a0,a1,aL,aR,geom,itime);
+ Vec4vf4 b0,b1,bL,bR;
+ gatheri(b0,b1,bL,bR,geom,itime+1);
+ p0 = lerp(a0,b0,vfloat4(ftime));
+ p1 = lerp(a1,b1,vfloat4(ftime));
+ pL = lerp(aL,bL,vfloat4(ftime));
+ pR = lerp(aR,bR,vfloat4(ftime));
+ }
+
+#if defined(__AVX__)
+
+ template<>
+ __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+ Vec4vf8& p1,
+ const LineSegments* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+ transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+ const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+ const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+ const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+ const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+ transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+ Vec4vf8& p1,
+ const LineSegments* geom,
+ const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+ transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+ const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+ const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+ const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+ const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+ transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+ Vec4vf8& p1,
+ const LineSegments* geom,
+ float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf8 a0,a1;
+ gatheri(a0,a1,geom,itime);
+ Vec4vf8 b0,b1;
+ gatheri(b0,b1,geom,itime+1);
+ p0 = lerp(a0,b0,vfloat8(ftime));
+ p1 = lerp(a1,b1,vfloat8(ftime));
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+ Vec4vf8& p1,
+ Vec4vf8& pL,
+ Vec4vf8& pR,
+ const LineSegments* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+ transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+ const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+ const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+ const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+ const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+ transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+
+ const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf);
+ const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+ const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+ const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+ const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf);
+ const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf);
+ const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf);
+ const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf);
+ transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w);
+
+ const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf);
+ const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+ const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+ const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+ const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf);
+ const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf);
+ const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf);
+ const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf);
+ transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w);
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+ Vec4vf8& p1,
+ Vec4vf8& pL,
+ Vec4vf8& pR,
+ const LineSegments* geom,
+ const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+ transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w);
+
+ const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+ const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+ const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+ const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+ const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+ const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+ const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+ const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+ transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w);
+
+ const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf);
+ const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+ const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+ const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+ const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf);
+ const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf);
+ const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf);
+ const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf);
+ transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w);
+
+ const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf);
+ const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+ const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+ const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+ const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf);
+ const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf);
+ const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf);
+ const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf);
+ transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w);
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+ Vec4vf8& p1,
+ Vec4vf8& pL,
+ Vec4vf8& pR,
+ const LineSegments* geom,
+ float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf8 a0,a1,aL,aR;
+ gatheri(a0,a1,aL,aR,geom,itime);
+ Vec4vf8 b0,b1,bL,bR;
+ gatheri(b0,b1,bL,bR,geom,itime+1);
+ p0 = lerp(a0,b0,vfloat8(ftime));
+ p1 = lerp(a1,b1,vfloat8(ftime));
+ pL = lerp(aL,bL,vfloat8(ftime));
+ pR = lerp(aR,bR,vfloat8(ftime));
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+ Vec4vf8& p1,
+ vbool8& cL,
+ vbool8& cR,
+ const LineSegments* geom) const
+ {
+ gather(p0,p1,geom);
+ cL = !vbool8(leftExists);
+ cR = !vbool8(rightExists);
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gatheri(Vec4vf8& p0,
+ Vec4vf8& p1,
+ vbool8& cL,
+ vbool8& cR,
+ const LineSegments* geom,
+ const int itime) const
+ {
+ gatheri(p0,p1,geom,itime);
+ cL = !vbool8(leftExists);
+ cR = !vbool8(rightExists);
+ }
+
+ template<>
+ __forceinline void LineMi<8>::gather(Vec4vf8& p0,
+ Vec4vf8& p1,
+ vbool8& cL,
+ vbool8& cR,
+ const LineSegments* geom,
+ float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf8 a0,a1;
+ gatheri(a0,a1,geom,itime);
+ Vec4vf8 b0,b1;
+ gatheri(b0,b1,geom,itime+1);
+ p0 = lerp(a0,b0,vfloat8(ftime));
+ p1 = lerp(a1,b1,vfloat8(ftime));
+ cL = !vbool8(leftExists);
+ cR = !vbool8(rightExists);
+ }
+
+#endif
+
+ template<int M>
+ typename LineMi<M>::Type LineMi<M>::type;
+
+ typedef LineMi<4> Line4i;
+ typedef LineMi<8> Line8i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h
new file mode 100644
index 0000000000..a431796a88
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h
@@ -0,0 +1,124 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linei.h"
+#include "line_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M, int Mx, bool filter>
+ struct FlatLinearCurveMiIntersector1
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct FlatLinearCurveMiMBIntersector1
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time());
+ const vbool<Mx> valid = line.template valid<Mx>();
+ FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time());
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return FlatLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct FlatLinearCurveMiIntersectorK
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct FlatLinearCurveMiMBIntersectorK
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return FlatLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/object.h b/thirdparty/embree-aarch64/kernels/geometry/object.h
new file mode 100644
index 0000000000..f26391de52
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/object.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ struct Object
+ {
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored primitives */
+ static __forceinline size_t max_size() { return 1; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return N; }
+
+ public:
+
+ /*! constructs a virtual object */
+ Object (unsigned geomID, unsigned primID)
+ : _geomID(geomID), _primID(primID) {}
+
+ __forceinline unsigned geomID() const {
+ return _geomID;
+ }
+
+ __forceinline unsigned primID() const {
+ return _primID;
+ }
+
+ /*! fill triangle from triangle list */
+ __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+ {
+ const PrimRef& prim = prims[i]; i++;
+ new (this) Object(prim.geomID(), prim.primID());
+ }
+
+ /*! fill triangle from triangle list */
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+ {
+ const PrimRef& prim = prims[i]; i++;
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ new (this) Object(geomID, primID);
+ AccelSet* accel = (AccelSet*) scene->get(geomID);
+ return accel->linearBounds(primID,itime);
+ }
+
+ /*! fill triangle from triangle list */
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ const PrimRefMB& prim = prims[i]; i++;
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ new (this) Object(geomID, primID);
+ AccelSet* accel = (AccelSet*) scene->get(geomID);
+ return accel->linearBounds(primID,time_range);
+ }
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(AccelSet* mesh) {
+ return mesh->bounds(primID());
+ }
+
+ private:
+ unsigned int _geomID; //!< geometry ID
+ unsigned int _primID; //!< primitive ID
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h
new file mode 100644
index 0000000000..97882e0e59
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "object.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<bool mblur>
+ struct ObjectIntersector1
+ {
+ typedef Object Primitive;
+
+ static const bool validIntersectorK = false;
+
+ struct Precalculations {
+ __forceinline Precalculations() {}
+ __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+ };
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+ {
+ AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+
+ /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+ if ((ray.mask & accel->mask) == 0)
+ return;
+#endif
+
+ accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1);
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+ {
+ AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+ /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+ if ((ray.mask & accel->mask) == 0)
+ return false;
+#endif
+
+ accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+ return ray.tfar < 0.0f;
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+ {
+ AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID());
+ context->geomID = prim.geomID();
+ context->primID = prim.primID();
+ return accel->pointQuery(query, context);
+ }
+
+ template<int K>
+ static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ assert(false);
+ }
+
+ template<int K>
+ static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+ {
+ assert(false);
+ return valid;
+ }
+ };
+
+ template<int K, bool mblur>
+ struct ObjectIntersectorK
+ {
+ typedef Object Primitive;
+
+ struct Precalculations {
+ __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+ };
+
+ static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vbool<K> valid = valid_i;
+ AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+
+ /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+ valid &= (ray.mask & accel->mask) != 0;
+ if (none(valid)) return;
+#endif
+ accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
+ {
+ vbool<K> valid = valid_i;
+ AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID());
+
+ /* perform ray mask test */
+#if defined(EMBREE_RAY_MASK)
+ valid &= (ray.mask & accel->mask) != 0;
+ if (none(valid)) return false;
+#endif
+ accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+ return ray.tfar < 0.0f;
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+ intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+ occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+ return ray.tfar[k] < 0.0f;
+ }
+ };
+
+ typedef ObjectIntersectorK<4,false> ObjectIntersector4;
+ typedef ObjectIntersectorK<8,false> ObjectIntersector8;
+ typedef ObjectIntersectorK<16,false> ObjectIntersector16;
+
+ typedef ObjectIntersectorK<4,true> ObjectIntersector4MB;
+ typedef ObjectIntersectorK<8,true> ObjectIntersector8MB;
+ typedef ObjectIntersectorK<16,true> ObjectIntersector16MB;
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/plane.h b/thirdparty/embree-aarch64/kernels/geometry/plane.h
new file mode 100644
index 0000000000..ebe45db558
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/plane.h
@@ -0,0 +1,57 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct HalfPlane
+ {
+ const Vec3fa P; //!< plane origin
+ const Vec3fa N; //!< plane normal
+
+ __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N)
+ : P(P), N(N) {}
+
+ __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+ {
+ Vec3fa O = Vec3fa(ray_org) - P;
+ Vec3fa D = Vec3fa(ray_dir);
+ float ON = dot(O,N);
+ float DN = dot(D,N);
+ bool eps = abs(DN) < min_rcp_input;
+ float t = -ON*rcp(DN);
+ float lower = select(eps || DN < 0.0f, float(neg_inf), t);
+ float upper = select(eps || DN > 0.0f, float(pos_inf), t);
+ return BBox1f(lower,upper);
+ }
+ };
+
+ template<int M>
+ struct HalfPlaneN
+ {
+ const Vec3vf<M> P; //!< plane origin
+ const Vec3vf<M> N; //!< plane normal
+
+ __forceinline HalfPlaneN(const Vec3vf<M>& P, const Vec3vf<M>& N)
+ : P(P), N(N) {}
+
+ __forceinline BBox<vfloat<M>> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+ {
+ Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray_org) - P;
+ Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray_dir);
+ vfloat<M> ON = dot(O,N);
+ vfloat<M> DN = dot(D,N);
+ vbool<M> eps = abs(DN) < min_rcp_input;
+ vfloat<M> t = -ON*rcp(DN);
+ vfloat<M> lower = select(eps | DN < 0.0f, vfloat<M>(neg_inf), t);
+ vfloat<M> upper = select(eps | DN > 0.0f, vfloat<M>(pos_inf), t);
+ return BBox<vfloat<M>>(lower,upper);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/pointi.h b/thirdparty/embree-aarch64/kernels/geometry/pointi.h
new file mode 100644
index 0000000000..4ba298e86b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/pointi.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ template<int M>
+ struct PointMi
+ {
+ /* Virtual interface to query information about the line segment type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored line segments */
+ static __forceinline size_t max_size()
+ {
+ return M;
+ }
+
+ /* Returns required number of primitive blocks for N line segments */
+ static __forceinline size_t blocks(size_t N)
+ {
+ return (N + max_size() - 1) / max_size();
+ }
+
+ /* Returns required number of bytes for N line segments */
+ static __forceinline size_t bytes(size_t N)
+ {
+ return blocks(N) * sizeof(PointMi);
+ }
+
+ public:
+ /* Default constructor */
+ __forceinline PointMi() {}
+
+ /* Construction from vertices and IDs */
+ __forceinline PointMi(const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype, uint32_t numPrimitives)
+ : gtype((unsigned char)gtype),
+ numPrimitives(numPrimitives),
+ sharedGeomID(geomIDs[0]),
+ primIDs(primIDs)
+ {
+ assert(all(vuint<M>(geomID()) == geomIDs));
+ }
+
+ /* Returns a mask that tells which line segments are valid */
+ __forceinline vbool<M> valid() const {
+ return vint<M>(step) < vint<M>(numPrimitives);
+ }
+
+ /* Returns a mask that tells which line segments are valid */
+ template<int Mx> __forceinline vbool<Mx> valid() const {
+ return vint<Mx>(step) < vint<Mx>(numPrimitives);
+ }
+
+ /* Returns if the specified line segment is valid */
+ __forceinline bool valid(const size_t i) const
+ {
+ assert(i < M);
+ return i < numPrimitives;
+ }
+
+ /* Returns the number of stored line segments */
+ __forceinline size_t size() const {
+ return numPrimitives;
+ }
+
+ __forceinline unsigned int geomID(unsigned int i = 0) const {
+ return sharedGeomID;
+ }
+
+ __forceinline vuint<M>& primID() {
+ return primIDs;
+ }
+ __forceinline const vuint<M>& primID() const {
+ return primIDs;
+ }
+ __forceinline unsigned int primID(const size_t i) const {
+ assert(i < M);
+ return primIDs[i];
+ }
+
+ /* gather the line segments */
+ __forceinline void gather(Vec4vf<M>& p0, const Points* geom) const;
+ __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom) const;
+
+ __forceinline void gatheri(Vec4vf<M>& p0, const Points* geom, const int itime) const;
+ __forceinline void gatheri(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, const int itime) const;
+
+ __forceinline void gather(Vec4vf<M>& p0, const Points* geom, float time) const;
+ __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, float time) const;
+
+ /* Calculate the bounds of the line segments */
+ __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+ {
+ BBox3fa bounds = empty;
+ for (size_t i = 0; i < M && valid(i); i++) {
+ const Points* geom = scene->get<Points>(geomID(i));
+ bounds.extend(geom->bounds(primID(i),itime));
+ }
+ return bounds;
+ }
+
+ /* Calculate the linear bounds of the primitive */
+ __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+ return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1));
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i = 0; i < M && valid(i); i++) {
+ const Points* geom = scene->get<Points>(geomID(i));
+ allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+ }
+ return allBounds;
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i = 0; i < M && valid(i); i++) {
+ const Points* geom = scene->get<Points>(geomID((unsigned int)i));
+ allBounds.extend(geom->linearBounds(primID(i), time_range));
+ }
+ return allBounds;
+ }
+
+ /* Fill line segment from line segment list */
+ template<typename PrimRefT>
+ __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ Geometry::GType gty = scene->get(prims[begin].geomID())->getType();
+ vuint<M> geomID, primID;
+ vuint<M> v0;
+ const PrimRefT* prim = &prims[begin];
+
+ int numPrimitives = 0;
+ for (size_t i = 0; i < M; i++) {
+ if (begin < end) {
+ geomID[i] = prim->geomID();
+ primID[i] = prim->primID();
+ begin++;
+ numPrimitives++;
+ } else {
+ assert(i);
+ if (i > 0) {
+ geomID[i] = geomID[i - 1];
+ primID[i] = primID[i - 1];
+ }
+ }
+ if (begin < end)
+ prim = &prims[begin]; // FIXME: remove this line
+ }
+ new (this) PointMi(geomID, primID, gty, numPrimitives); // FIXME: use non temporal store
+ }
+
+ template<typename BVH, typename Allocator>
+ __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh,
+ const PrimRef* prims,
+ const range<size_t>& set,
+ const Allocator& alloc)
+ {
+ size_t start = set.begin();
+ size_t items = PointMi::blocks(set.size());
+ size_t numbytes = PointMi::bytes(set.size());
+ PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+ for (size_t i = 0; i < items; i++) {
+ accel[i].fill(prims, start, set.end(), bvh->scene);
+ }
+ return bvh->encodeLeaf((char*)accel, items);
+ };
+
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+ {
+ fill(prims, begin, end, scene);
+ return linearBounds(scene, itime);
+ }
+
+ __forceinline LBBox3fa fillMB(
+ const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ fill(prims, begin, end, scene);
+ return linearBounds(scene, time_range);
+ }
+
+ template<typename BVH, typename SetMB, typename Allocator>
+ __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+ {
+ size_t start = prims.object_range.begin();
+ size_t end = prims.object_range.end();
+ size_t items = PointMi::blocks(prims.object_range.size());
+ size_t numbytes = PointMi::bytes(prims.object_range.size());
+ PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+ const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items);
+
+ LBBox3fa bounds = empty;
+ for (size_t i = 0; i < items; i++)
+ bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range));
+
+ return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range);
+ };
+
+ /*! output operator */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& line)
+ {
+ return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}";
+ }
+
+ public:
+ unsigned char gtype;
+ unsigned char numPrimitives;
+ unsigned int sharedGeomID;
+
+ private:
+ vuint<M> primIDs; // primitive ID
+ };
+
+ template<>
+ __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+ transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+ }
+
+ template<>
+ __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+ transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+ const vfloat4 b0 = vfloat4(geom->normal(primID(0)));
+ const vfloat4 b1 = vfloat4(geom->normal(primID(1)));
+ const vfloat4 b2 = vfloat4(geom->normal(primID(2)));
+ const vfloat4 b3 = vfloat4(geom->normal(primID(3)));
+ transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);
+ }
+
+ template<>
+ __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+ transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+ }
+
+ template<>
+ __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+ transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w);
+ const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
+ const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
+ const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
+ const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+ transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z);
+ }
+
+ template<>
+ __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf4 a0; gatheri(a0, geom, itime);
+ Vec4vf4 b0; gatheri(b0, geom, itime + 1);
+ p0 = lerp(a0, b0, vfloat4(ftime));
+ }
+
+ template<>
+ __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf4 a0, b0;
+ Vec3vf4 norm0, norm1;
+ gatheri(a0, norm0, geom, itime);
+ gatheri(b0, norm1, geom, itime + 1);
+ p0 = lerp(a0, b0, vfloat4(ftime));
+ n0 = lerp(norm0, norm1, vfloat4(ftime));
+ }
+
+#if defined(__AVX__)
+
+ template<>
+ __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+ transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+ }
+
+ template<>
+ __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+ transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+ const vfloat4 b0 = vfloat4(geom->normal(primID(0)));
+ const vfloat4 b1 = vfloat4(geom->normal(primID(1)));
+ const vfloat4 b2 = vfloat4(geom->normal(primID(2)));
+ const vfloat4 b3 = vfloat4(geom->normal(primID(3)));
+ const vfloat4 b4 = vfloat4(geom->normal(primID(4)));
+ const vfloat4 b5 = vfloat4(geom->normal(primID(5)));
+ const vfloat4 b6 = vfloat4(geom->normal(primID(6)));
+ const vfloat4 b7 = vfloat4(geom->normal(primID(7)));
+ transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);
+ }
+
+ template<>
+ __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+ transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+ }
+
+ template<>
+ __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const
+ {
+ const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+ const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+ const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+ const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+ const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+ const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+ const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+ const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+ transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w);
+ const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime));
+ const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime));
+ const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime));
+ const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime));
+ const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime));
+ const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime));
+ const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime));
+ const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime));
+ transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z);
+ }
+
+ template<>
+ __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf8 a0;
+ gatheri(a0, geom, itime);
+ Vec4vf8 b0;
+ gatheri(b0, geom, itime + 1);
+ p0 = lerp(a0, b0, vfloat8(ftime));
+ }
+
+ template<>
+ __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const
+ {
+ float ftime;
+ const int itime = geom->timeSegment(time, ftime);
+
+ Vec4vf8 a0, b0;
+ Vec3vf8 norm0, norm1;
+ gatheri(a0, norm0, geom, itime);
+ gatheri(b0, norm1, geom, itime + 1);
+ p0 = lerp(a0, b0, vfloat8(ftime));
+ n0 = lerp(norm0, norm1, vfloat8(ftime));
+ }
+#endif
+
+ template<int M>
+ typename PointMi<M>::Type PointMi<M>::type;
+
+ typedef PointMi<4> Point4i;
+ typedef PointMi<8> Point8i;
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive.h b/thirdparty/embree-aarch64/kernels/geometry/primitive.h
new file mode 100644
index 0000000000..41e5b2b304
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/primitive.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene.h"
+#include "../../common/simd/simd.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+ struct PrimitiveType
+ {
+ /*! returns name of this primitive type */
+ virtual const char* name() const = 0;
+
+ /*! Returns the number of stored active primitives in a block. */
+ virtual size_t sizeActive(const char* This) const = 0;
+
+ /*! Returns the number of stored active and inactive primitives in a block. */
+ virtual size_t sizeTotal(const char* This) const = 0;
+
+ /*! Returns the number of bytes of block. */
+ virtual size_t getBytes(const char* This) const = 0;
+ };
+
+ template<typename Primitive>
+ struct PrimitivePointQuery1
+ {
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+ {
+ bool changed = false;
+ for (size_t i = 0; i < Primitive::max_size(); i++)
+ {
+ if (!prim.valid(i)) break;
+ STAT3(point_query.trav_prims,1,1,1);
+ AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i));
+ context->geomID = prim.geomID(i);
+ context->primID = prim.primID(i);
+ changed |= accel->pointQuery(query, context);
+ }
+ return changed;
+ }
+
+ static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { }
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp
new file mode 100644
index 0000000000..f93574c9c8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp
@@ -0,0 +1,379 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+#include "curveNv.h"
+#include "curveNi.h"
+#include "curveNi_mb.h"
+#include "linei.h"
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "trianglei.h"
+#include "quadv.h"
+#include "quadi.h"
+#include "subdivpatch1.h"
+#include "object.h"
+#include "instance.h"
+#include "subgrid.h"
+
+namespace embree
+{
+ /********************** Curve4v **************************/
+
+ template<>
+ const char* Curve4v::Type::name () const {
+ return "curve4v";
+ }
+
+ template<>
+ size_t Curve4v::Type::sizeActive(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return ((Line4i*)This)->size();
+ else
+ return ((Curve4v*)This)->N;
+ }
+
+ template<>
+ size_t Curve4v::Type::sizeTotal(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return 4;
+ else
+ return ((Curve4v*)This)->N;
+ }
+
+ template<>
+ size_t Curve4v::Type::getBytes(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return Line4i::bytes(sizeActive(This));
+ else
+ return Curve4v::bytes(sizeActive(This));
+ }
+
+ /********************** Curve4i **************************/
+
+ template<>
+ const char* Curve4i::Type::name () const {
+ return "curve4i";
+ }
+
+ template<>
+ size_t Curve4i::Type::sizeActive(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return ((Line4i*)This)->size();
+ else
+ return ((Curve4i*)This)->N;
+ }
+
+ template<>
+ size_t Curve4i::Type::sizeTotal(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return 4;
+ else
+ return ((Curve4i*)This)->N;
+ }
+
+ template<>
+ size_t Curve4i::Type::getBytes(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return Line4i::bytes(sizeActive(This));
+ else
+ return Curve4i::bytes(sizeActive(This));
+ }
+
+ /********************** Curve4iMB **************************/
+
+ template<>
+ const char* Curve4iMB::Type::name () const {
+ return "curve4imb";
+ }
+
+ template<>
+ size_t Curve4iMB::Type::sizeActive(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return ((Line4i*)This)->size();
+ else
+ return ((Curve4iMB*)This)->N;
+ }
+
+ template<>
+ size_t Curve4iMB::Type::sizeTotal(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return 4;
+ else
+ return ((Curve4iMB*)This)->N;
+ }
+
+ template<>
+ size_t Curve4iMB::Type::getBytes(const char* This) const
+ {
+ if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR)
+ return Line4i::bytes(sizeActive(This));
+ else
+ return Curve4iMB::bytes(sizeActive(This));
+ }
+
+ /********************** Line4i **************************/
+
+ template<>
+ const char* Line4i::Type::name () const {
+ return "line4i";
+ }
+
+ template<>
+ size_t Line4i::Type::sizeActive(const char* This) const {
+ return ((Line4i*)This)->size();
+ }
+
+ template<>
+ size_t Line4i::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Line4i::Type::getBytes(const char* This) const {
+ return sizeof(Line4i);
+ }
+
+ /********************** Triangle4 **************************/
+
+ template<>
+ const char* Triangle4::Type::name () const {
+ return "triangle4";
+ }
+
+ template<>
+ size_t Triangle4::Type::sizeActive(const char* This) const {
+ return ((Triangle4*)This)->size();
+ }
+
+ template<>
+ size_t Triangle4::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Triangle4::Type::getBytes(const char* This) const {
+ return sizeof(Triangle4);
+ }
+
+ /********************** Triangle4v **************************/
+
+ template<>
+ const char* Triangle4v::Type::name () const {
+ return "triangle4v";
+ }
+
+ template<>
+ size_t Triangle4v::Type::sizeActive(const char* This) const {
+ return ((Triangle4v*)This)->size();
+ }
+
+ template<>
+ size_t Triangle4v::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Triangle4v::Type::getBytes(const char* This) const {
+ return sizeof(Triangle4v);
+ }
+
+ /********************** Triangle4i **************************/
+
+ template<>
+ const char* Triangle4i::Type::name () const {
+ return "triangle4i";
+ }
+
+ template<>
+ size_t Triangle4i::Type::sizeActive(const char* This) const {
+ return ((Triangle4i*)This)->size();
+ }
+
+ template<>
+ size_t Triangle4i::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Triangle4i::Type::getBytes(const char* This) const {
+ return sizeof(Triangle4i);
+ }
+
+ /********************** Triangle4vMB **************************/
+
+ template<>
+ const char* Triangle4vMB::Type::name () const {
+ return "triangle4vmb";
+ }
+
+ template<>
+ size_t Triangle4vMB::Type::sizeActive(const char* This) const {
+ return ((Triangle4vMB*)This)->size();
+ }
+
+ template<>
+ size_t Triangle4vMB::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Triangle4vMB::Type::getBytes(const char* This) const {
+ return sizeof(Triangle4vMB);
+ }
+
+ /********************** Quad4v **************************/
+
+ template<>
+ const char* Quad4v::Type::name () const {
+ return "quad4v";
+ }
+
+ template<>
+ size_t Quad4v::Type::sizeActive(const char* This) const {
+ return ((Quad4v*)This)->size();
+ }
+
+ template<>
+ size_t Quad4v::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Quad4v::Type::getBytes(const char* This) const {
+ return sizeof(Quad4v);
+ }
+
+ /********************** Quad4i **************************/
+
+ template<>
+ const char* Quad4i::Type::name () const {
+ return "quad4i";
+ }
+
+ template<>
+ size_t Quad4i::Type::sizeActive(const char* This) const {
+ return ((Quad4i*)This)->size();
+ }
+
+ template<>
+ size_t Quad4i::Type::sizeTotal(const char* This) const {
+ return 4;
+ }
+
+ template<>
+ size_t Quad4i::Type::getBytes(const char* This) const {
+ return sizeof(Quad4i);
+ }
+
+ /********************** SubdivPatch1 **************************/
+
+ const char* SubdivPatch1::Type::name () const {
+ return "subdivpatch1";
+ }
+
+ size_t SubdivPatch1::Type::sizeActive(const char* This) const {
+ return 1;
+ }
+
+ size_t SubdivPatch1::Type::sizeTotal(const char* This) const {
+ return 1;
+ }
+
+ size_t SubdivPatch1::Type::getBytes(const char* This) const {
+ return sizeof(SubdivPatch1);
+ }
+
+ SubdivPatch1::Type SubdivPatch1::type;
+
+ /********************** Virtual Object **************************/
+
+ const char* Object::Type::name () const {
+ return "object";
+ }
+
+ size_t Object::Type::sizeActive(const char* This) const {
+ return 1;
+ }
+
+ size_t Object::Type::sizeTotal(const char* This) const {
+ return 1;
+ }
+
+ size_t Object::Type::getBytes(const char* This) const {
+ return sizeof(Object);
+ }
+
+ Object::Type Object::type;
+
+ /********************** Instance **************************/
+
+ const char* InstancePrimitive::Type::name () const {
+ return "instance";
+ }
+
+ size_t InstancePrimitive::Type::sizeActive(const char* This) const {
+ return 1;
+ }
+
+ size_t InstancePrimitive::Type::sizeTotal(const char* This) const {
+ return 1;
+ }
+
+ size_t InstancePrimitive::Type::getBytes(const char* This) const {
+ return sizeof(InstancePrimitive);
+ }
+
+ InstancePrimitive::Type InstancePrimitive::type;
+
+ /********************** SubGrid **************************/
+
+ const char* SubGrid::Type::name () const {
+ return "subgrid";
+ }
+
+ size_t SubGrid::Type::sizeActive(const char* This) const {
+ return 1;
+ }
+
+ size_t SubGrid::Type::sizeTotal(const char* This) const {
+ return 1;
+ }
+
+ size_t SubGrid::Type::getBytes(const char* This) const {
+ return sizeof(SubGrid);
+ }
+
+ SubGrid::Type SubGrid::type;
+
+ /********************** SubGridQBVH4 **************************/
+
+ template<>
+ const char* SubGridQBVH4::Type::name () const {
+ return "SubGridQBVH4";
+ }
+
+ template<>
+ size_t SubGridQBVH4::Type::sizeActive(const char* This) const {
+ return 1;
+ }
+
+ template<>
+ size_t SubGridQBVH4::Type::sizeTotal(const char* This) const {
+ return 1;
+ }
+
+ template<>
+ size_t SubGridQBVH4::Type::getBytes(const char* This) const {
+ return sizeof(SubGridQBVH4);
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h
new file mode 100644
index 0000000000..57ff4e60e5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects a ray with a quad with backface culling
+ * enabled. The quad v0,v1,v2,v3 is split into two triangles
+ * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two
+ * triangles gets intersected. */
+ template<int N>
+ __forceinline vbool<N> intersect_quad_backface_culling(const vbool<N>& valid0,
+ const Vec3fa& ray_org,
+ const Vec3fa& ray_dir,
+ const float ray_tnear,
+ const float ray_tfar,
+ const Vec3vf<N>& quad_v0,
+ const Vec3vf<N>& quad_v1,
+ const Vec3vf<N>& quad_v2,
+ const Vec3vf<N>& quad_v3,
+ vfloat<N>& u_o,
+ vfloat<N>& v_o,
+ vfloat<N>& t_o)
+ {
+ /* calculate vertices relative to ray origin */
+ vbool<N> valid = valid0;
+ const Vec3vf<N> O = Vec3vf<N>(ray_org);
+ const Vec3vf<N> D = Vec3vf<N>(ray_dir);
+ const Vec3vf<N> va = quad_v0-O;
+ const Vec3vf<N> vb = quad_v1-O;
+ const Vec3vf<N> vc = quad_v2-O;
+ const Vec3vf<N> vd = quad_v3-O;
+
+ const Vec3vf<N> edb = vb-vd;
+ const vfloat<N> WW = dot(cross(vd,edb),D);
+ const Vec3vf<N> v0 = select(WW <= 0.0f,va,vc);
+ const Vec3vf<N> v1 = select(WW <= 0.0f,vb,vd);
+ const Vec3vf<N> v2 = select(WW <= 0.0f,vd,vb);
+
+ /* calculate edges */
+ const Vec3vf<N> e0 = v2-v0;
+ const Vec3vf<N> e1 = v0-v1;
+
+ /* perform edge tests */
+ const vfloat<N> U = dot(cross(v0,e0),D);
+ const vfloat<N> V = dot(cross(v1,e1),D);
+ valid &= max(U,V) <= 0.0f;
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<N> Ng = cross(e1,e0);
+ const vfloat<N> den = dot(Ng,D);
+ const vfloat<N> rcpDen = rcp(den);
+
+ /* perform depth test */
+ const vfloat<N> t = rcpDen*dot(v0,Ng);
+ valid &= vfloat<N>(ray_tnear) <= t & t <= vfloat<N>(ray_tfar);
+ if (unlikely(none(valid))) return false;
+
+ /* avoid division by 0 */
+ valid &= den != vfloat<N>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ t_o = t;
+ u_o = U * rcpDen;
+ v_o = V * rcpDen;
+ u_o = select(WW <= 0.0f,u_o,1.0f-u_o);
+ v_o = select(WW <= 0.0f,v_o,1.0f-v_o);
+ return valid;
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h
new file mode 100644
index 0000000000..74e8c7720c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h
@@ -0,0 +1,566 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct QuadHitM
+ {
+ __forceinline QuadHitM() {}
+
+ __forceinline QuadHitM(const vbool<M>& valid,
+ const vfloat<M>& U,
+ const vfloat<M>& V,
+ const vfloat<M>& T,
+ const vfloat<M>& absDen,
+ const Vec3vf<M>& Ng,
+ const vbool<M>& flags)
+ : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {}
+
+ __forceinline void finalize()
+ {
+ const vfloat<M> rcpAbsDen = rcp(absDen);
+ vt = T * rcpAbsDen;
+ const vfloat<M> u = min(U * rcpAbsDen,1.0f);
+ const vfloat<M> v = min(V * rcpAbsDen,1.0f);
+ const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+ const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+ vu = select(flags,u1,u);
+ vv = select(flags,v1,v);
+ vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+ const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+ vv = select(flags,u1,v);
+ vu = select(flags,v1,u);
+ vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);
+#endif
+ }
+
+ __forceinline Vec2f uv(const size_t i)
+ {
+ const float u = vu[i];
+ const float v = vv[i];
+ return Vec2f(u,v);
+ }
+
+ __forceinline float t(const size_t i) { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ private:
+ vfloat<M> U;
+ vfloat<M> V;
+ vfloat<M> T;
+ vfloat<M> absDen;
+ Vec3vf<M> tri_Ng;
+
+ public:
+ vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+
+ public:
+ const vbool<M> flags;
+ };
+
+ template<int K>
+ struct QuadHitK
+ {
+ __forceinline QuadHitK(const vfloat<K>& U,
+ const vfloat<K>& V,
+ const vfloat<K>& T,
+ const vfloat<K>& absDen,
+ const Vec3vf<K>& Ng,
+ const vbool<K>& flags)
+ : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vfloat<K> rcpAbsDen = rcp(absDen);
+ const vfloat<K> t = T * rcpAbsDen;
+ const vfloat<K> u0 = min(U * rcpAbsDen,1.0f);
+ const vfloat<K> v0 = min(V * rcpAbsDen,1.0f);
+ const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+ const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+ const vfloat<K> u = select(flags,u1,u0);
+ const vfloat<K> v = select(flags,v1,v0);
+ const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> T;
+ const vfloat<K> absDen;
+ const vbool<K> flags;
+ const Vec3vf<K> tri_Ng;
+ };
+
+ /* ----------------------------- */
+ /* -- single ray intersectors -- */
+ /* ----------------------------- */
+
+
+ template<int M, bool filter>
+ struct QuadMIntersector1MoellerTrumbore;
+
+ /*! Intersects M quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMIntersector1MoellerTrumbore
+ {
+ __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+ __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+ __forceinline void intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+ Intersect1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+
+ /* intersect first triangle */
+ if (intersector.intersect(ray,v0,v1,v3,hit))
+ epilog(hit.valid,hit);
+
+ /* intersect second triangle */
+ if (intersector.intersect(ray,v2,v3,v1,hit))
+ {
+ hit.U = hit.absDen - hit.U;
+ hit.V = hit.absDen - hit.V;
+ epilog(hit.valid,hit);
+ }
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+ Occluded1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+
+ /* intersect first triangle */
+ if (intersector.intersect(ray,v0,v1,v3,hit))
+ {
+ if (epilog(hit.valid,hit))
+ return true;
+ }
+
+ /* intersect second triangle */
+ if (intersector.intersect(ray,v2,v3,v1,hit))
+ {
+ hit.U = hit.absDen - hit.U;
+ hit.V = hit.absDen - hit.V;
+ if (epilog(hit.valid,hit))
+ return true;
+ }
+ return false;
+ }
+ };
+
+#if defined(__AVX512ER__) // KNL
+
+ /*! Intersects 4 quads with 1 ray using AVX512 */
+ template<bool filter>
+ struct QuadMIntersector1MoellerTrumbore<4,filter>
+ {
+ __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+ __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)),
+ select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+ select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z));
+ const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z));
+#else
+ const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)),
+ select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+ select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+ const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+ select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+ select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+ const vbool16 flags(0xf0f0);
+
+ MoellerTrumboreHitM<16> hit;
+ MoellerTrumboreIntersector1<16> intersector(ray,nullptr);
+ if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+ {
+ vfloat16 U = hit.U, V = hit.V, absDen = hit.absDen;
+#if !defined(EMBREE_BACKFACE_CULLING)
+ hit.U = select(flags,absDen-V,U);
+ hit.V = select(flags,absDen-U,V);
+ hit.vNg *= select(flags,vfloat16(-1.0f),vfloat16(1.0f)); // FIXME: use XOR
+#else
+ hit.U = select(flags,absDen-U,U);
+ hit.V = select(flags,absDen-V,V);
+#endif
+ if (likely(epilog(hit.valid,hit)))
+ return true;
+ }
+ return false;
+ }
+
+ __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#elif defined(__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<bool filter>
+ struct QuadMIntersector1MoellerTrumbore<4,filter>
+ {
+ __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+ __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ MoellerTrumboreHitM<8> hit;
+ MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+ if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+ {
+ vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+
+#if !defined(EMBREE_BACKFACE_CULLING)
+ hit.U = select(flags,absDen-V,U);
+ hit.V = select(flags,absDen-U,V);
+ hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR
+#else
+ hit.U = select(flags,absDen-U,U);
+ hit.V = select(flags,absDen-V,V);
+#endif
+ if (unlikely(epilog(hit.valid,hit)))
+ return true;
+ }
+ return false;
+ }
+
+ __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#endif
+
+ /* ----------------------------- */
+ /* -- ray packet intersectors -- */
+ /* ----------------------------- */
+
+
+ struct MoellerTrumboreIntersector1KTriangleM
+ {
+ /*! Intersect k'th ray from ray packet of size K with M triangles. */
+ template<int M, int K, typename Epilog>
+ static __forceinline bool intersect(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Vec3vf<M>& tri_Ng,
+ const vbool<M>& flags,
+ const Epilog& epilog)
+ {
+ /* calculate denominator */
+ const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+ const Vec3vf<M> R = cross(C,D);
+ const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+ const vfloat<M> absDen = abs(den);
+ const vfloat<M> sgnDen = signmsk(den);
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+ const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+ vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+ valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+ if (likely(none(valid))) return false;
+
+ /* calculate hit information */
+ QuadHitM<M> hit(valid,U,V,T,absDen,tri_Ng,flags);
+ return epilog(valid,hit);
+ }
+
+ template<int M, int K, typename Epilog>
+ static __forceinline bool intersect1(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const vbool<M>& flags,
+ const Epilog& epilog)
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ const Vec3vf<M> Ng = cross(e2,e1);
+ return intersect(ray,k,v0,e1,e2,Ng,flags,epilog);
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct QuadMIntersectorKMoellerTrumboreBase
+ {
+ __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_e1,
+ const Vec3vf<K>& tri_e2,
+ const Vec3vf<K>& tri_Ng,
+ const vbool<K>& flags,
+ const Epilog& epilog) const
+ {
+ /* calculate denominator */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> C = tri_v0 - ray.org;
+ const Vec3vf<K> R = cross(C,ray.dir);
+ const vfloat<K> den = dot(tri_Ng,ray.dir);
+ const vfloat<K> absDen = abs(den);
+ const vfloat<K> sgnDen = signmsk(den);
+
+ /* test against edge p2 p0 */
+ const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+ valid &= U >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p0 p1 */
+ const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+ valid &= V >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p1 p2 */
+ const vfloat<K> W = absDen-U-V;
+ valid &= W >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+ valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar);
+ if (unlikely(none(valid))) return false;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= den < vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#else
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* calculate hit information */
+ QuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags);
+ return epilog(valid,hit);
+ }
+
+ /*! Intersects K rays with one of M quads. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const vbool<K>& flags,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> e1 = tri_v0-tri_v1;
+ const Vec3vf<K> e2 = tri_v2-tri_v0;
+ const Vec3vf<K> Ng = cross(e2,e1);
+ return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog);
+ }
+
+ /*! Intersects K rays with one of M quads. */
+ template<typename Epilog>
+ __forceinline bool intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& v0,
+ const Vec3vf<K>& v1,
+ const Vec3vf<K>& v2,
+ const Vec3vf<K>& v3,
+ const Epilog& epilog) const
+ {
+ intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);
+ if (none(valid0)) return true;
+ intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);
+ return none(valid0);
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+ {
+ __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+ : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+
+ __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ Intersect1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+ MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog);
+ MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog);
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ Occluded1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+ if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true;
+ if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true;
+ return false;
+ }
+ };
+
+
+#if defined(__AVX512ER__) // KNL
+
+ /*! Intersects 4 quads with 1 ray using AVX512 */
+ template<int K, bool filter>
+ struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+ {
+ __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+ : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect1(RayK<K>& ray, size_t k,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)),
+ select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+ select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z));
+ const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z));
+#else
+ const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)),
+ select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+ select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+ const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+ select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+ select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+ const vbool16 flags(0xf0f0);
+ return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+ }
+
+ __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#elif defined(__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<int K, bool filter>
+ struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+ {
+ __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+ : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect1(RayK<K>& ray, size_t k,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+ return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+ }
+
+ __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h
new file mode 100644
index 0000000000..7ca3aed0a0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h
@@ -0,0 +1,529 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quad_intersector_moeller.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ * the ray origin into the origin of the coordinate system and then
+ * uses Pluecker coordinates for the intersection. Due to the shift,
+ * the Pluecker coordinate calculation simplifies and the tests get
+ * numerically stable. The edge equations are watertight along the
+ * edge for neighboring triangles. */
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct QuadHitPlueckerM
+ {
+ __forceinline QuadHitPlueckerM() {}
+
+ __forceinline QuadHitPlueckerM(const vbool<M>& valid,
+ const vfloat<M>& U,
+ const vfloat<M>& V,
+ const vfloat<M>& UVW,
+ const vfloat<M>& t,
+ const Vec3vf<M>& Ng,
+ const vbool<M>& flags)
+ : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {}
+
+ __forceinline void finalize()
+ {
+ const vbool<M> invalid = abs(UVW) < min_rcp_input;
+ const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW));
+ const vfloat<M> u = min(U * rcpUVW,1.0f);
+ const vfloat<M> v = min(V * rcpUVW,1.0f);
+ const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+ const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+ vu = select(flags,u1,u);
+ vv = select(flags,v1,v);
+ vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+ const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+ vv = select(flags,u1,v);
+ vu = select(flags,v1,u);
+ vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);
+#endif
+ }
+
+ __forceinline Vec2f uv(const size_t i)
+ {
+ const float u = vu[i];
+ const float v = vv[i];
+ return Vec2f(u,v);
+ }
+
+ __forceinline float t(const size_t i) { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ private:
+ vfloat<M> U;
+ vfloat<M> V;
+ vfloat<M> UVW;
+ Vec3vf<M> tri_Ng;
+
+ public:
+ vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+
+ public:
+ const vbool<M> flags;
+ };
+
+ template<int K>
+ struct QuadHitPlueckerK
+ {
+ __forceinline QuadHitPlueckerK(const vfloat<K>& U,
+ const vfloat<K>& V,
+ const vfloat<K>& UVW,
+ const vfloat<K>& t,
+ const Vec3vf<K>& Ng,
+ const vbool<K>& flags)
+ : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vbool<K> invalid = abs(UVW) < min_rcp_input;
+ const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW));
+ const vfloat<K> u0 = min(U * rcpUVW,1.0f);
+ const vfloat<K> v0 = min(V * rcpUVW,1.0f);
+ const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+ const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+ const vfloat<K> u = select(flags,u1,u0);
+ const vfloat<K> v = select(flags,v1,v0);
+ const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> UVW;
+ const vfloat<K> t;
+ const vbool<K> flags;
+ const Vec3vf<K> tri_Ng;
+ };
+
+ struct PlueckerIntersectorTriangle1
+ {
+ template<int M, typename Epilog>
+ static __forceinline bool intersect(Ray& ray,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ const vbool<M>& flags,
+ const Epilog& epilog)
+ {
+ /* calculate vertices relative to ray origin */
+ const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+ const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+ const Vec3vf<M> v0 = tri_v0-O;
+ const Vec3vf<M> v1 = tri_v1-O;
+ const Vec3vf<M> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<M> e0 = v2-v0;
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(cross(e0,v2+v0),D);
+ const vfloat<M> V = dot(cross(e1,v0+v1),D);
+ const vfloat<M> W = dot(cross(e2,v1+v2),D);
+ const vfloat<M> UVW = U+V+W;
+ const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = max(U,V,W) <= eps;
+#else
+ vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<M> den = twice(dot(Ng,D));
+
+ /* perform depth test */
+ const vfloat<M> T = twice(dot(v0,Ng));
+ const vfloat<M> t = rcp(den)*T;
+ valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar);
+ valid &= den != vfloat<M>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+ return epilog(valid,hit);
+ }
+ };
+
+ /*! Intersects M quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMIntersector1Pluecker
+ {
+ __forceinline QuadMIntersector1Pluecker() {}
+
+ __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+ __forceinline void intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ Intersect1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+ PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool<M>(false),epilog);
+ PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool<M>(true),epilog);
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ Occluded1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+ if (PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool<M>(false),epilog)) return true;
+ if (PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool<M>(true ),epilog)) return true;
+ return false;
+ }
+ };
+
+#if defined(__AVX512ER__) // KNL
+
+ /*! Intersects 4 quads with 1 ray using AVX512 */
+ template<bool filter>
+ struct QuadMIntersector1Pluecker<4,filter>
+ {
+ __forceinline QuadMIntersector1Pluecker() {}
+
+ __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)),
+ select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+ select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z));
+ const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z));
+#else
+ const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)),
+ select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+ select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+ const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+ select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+ select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+ const vbool16 flags(0xf0f0);
+ return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog);
+ }
+
+ __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#elif defined(__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<bool filter>
+ struct QuadMIntersector1Pluecker<4,filter>
+ {
+ __forceinline QuadMIntersector1Pluecker() {}
+
+ __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+ return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog);
+ }
+
+ __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#endif
+
+
+ /* ----------------------------- */
+ /* -- ray packet intersectors -- */
+ /* ----------------------------- */
+
+ struct PlueckerIntersector1KTriangleM
+ {
+ /*! Intersect k'th ray from ray packet of size K with M triangles. */
+ template<int M, int K, typename Epilog>
+ static __forceinline bool intersect1(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ const vbool<M>& flags,
+ const Epilog& epilog)
+ {
+ /* calculate vertices relative to ray origin */
+ const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vf<M> v0 = tri_v0-O;
+ const Vec3vf<M> v1 = tri_v1-O;
+ const Vec3vf<M> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<M> e0 = v2-v0;
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(cross(e0,v2+v0),D);
+ const vfloat<M> V = dot(cross(e1,v0+v1),D);
+ const vfloat<M> W = dot(cross(e2,v1+v2),D);
+ const vfloat<M> UVW = U+V+W;
+ const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = max(U,V,W) <= eps;
+#else
+ vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<M> den = twice(dot(Ng,D));
+
+ /* perform depth test */
+ const vfloat<M> T = twice(dot(v0,Ng));
+ const vfloat<M> t = rcp(den)*T;
+ valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]);
+ if (unlikely(none(valid))) return false;
+
+ /* avoid division by 0 */
+ valid &= den != vfloat<M>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+ return epilog(valid,hit);
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct QuadMIntersectorKPlueckerBase
+ {
+ __forceinline QuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const vbool<K>& flags,
+ const Epilog& epilog) const
+ {
+ /* calculate vertices relative to ray origin */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> O = ray.org;
+ const Vec3vf<K> D = ray.dir;
+ const Vec3vf<K> v0 = tri_v0-O;
+ const Vec3vf<K> v1 = tri_v1-O;
+ const Vec3vf<K> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<K> e0 = v2-v0;
+ const Vec3vf<K> e1 = v0-v1;
+ const Vec3vf<K> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+ const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+ const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+ const vfloat<K> UVW = U+V+W;
+ const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= max(U,V,W) <= eps;
+#else
+ valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+ /* perform depth test */
+ const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+ const vfloat<K> t = rcp(den)*T;
+ valid &= ray.tnear() <= t & t <= ray.tfar;
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* calculate hit information */
+ QuadHitPlueckerK<K> hit(U,V,UVW,t,Ng,flags);
+ return epilog(valid,hit);
+ }
+
+ /*! Intersects K rays with one of M quads. */
+ template<typename Epilog>
+ __forceinline bool intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& v0,
+ const Vec3vf<K>& v1,
+ const Vec3vf<K>& v2,
+ const Vec3vf<K>& v3,
+ const Epilog& epilog) const
+ {
+ intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);
+ if (none(valid0)) return true;
+ intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);
+ return none(valid0);
+ }
+ };
+
+ template<int M, int K, bool filter>
+ struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase<M,K,filter>
+ {
+ __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+ : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+
+ __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ Intersect1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+ PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog);
+ PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog);
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const vuint<M>& geomID, const vuint<M>& primID) const
+ {
+ Occluded1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+ if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true;
+ if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true;
+ return false;
+ }
+ };
+
+#if defined(__AVX512ER__) // KNL
+
+ /*! Intersects 4 quads with 1 ray using AVX512 */
+ template<int K, bool filter>
+ struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+ {
+ __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+ : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)),
+ select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+ select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z));
+ const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z));
+#else
+ const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)),
+ select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+ select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+ const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+ select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+ select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+
+ const vbool16 flags(0xf0f0);
+ return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+ }
+
+ __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#elif defined(__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<int K, bool filter>
+ struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+ {
+ __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+ : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+ }
+
+ __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const vuint4& geomID, const vuint4& primID) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+ }
+ };
+
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi.h b/thirdparty/embree-aarch64/kernels/geometry/quadi.h
new file mode 100644
index 0000000000..741ec519ab
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadi.h
@@ -0,0 +1,483 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+ /* Stores M quads from an indexed face set */
+ template <int M>
+ struct QuadMi
+ {
+ /* Virtual interface to query information about the quad type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored quads */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline QuadMi() { }
+
+ /* Construction from vertices and IDs */
+ __forceinline QuadMi(const vuint<M>& v0,
+ const vuint<M>& v1,
+ const vuint<M>& v2,
+ const vuint<M>& v3,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+ : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+ : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+ /* Returns a mask that tells which quads are valid */
+ __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+ /* Returns if the specified quad is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+ /* Returns the number of stored quads */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ __forceinline vuint<M>& geomID() { return geomIDs; }
+ __forceinline const vuint<M>& geomID() const { return geomIDs; }
+ __forceinline unsigned int geomID(const size_t i) const { assert(i<M); assert(geomIDs[i] != -1); return geomIDs[i]; }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M>& primID() { return primIDs; }
+ __forceinline const vuint<M>& primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* Calculate the bounds of the quads */
+ __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+ {
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<M && valid(i); i++) {
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+ bounds.extend(mesh->bounds(primID(i),itime));
+ }
+ return bounds;
+ }
+
+ /* Calculate the linear bounds of the primitive */
+ __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) {
+ return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+ allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps));
+ }
+ return allBounds;
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i));
+ allBounds.extend(mesh->linearBounds(primID(i), time_range));
+ }
+ return allBounds;
+ }
+
+ /* Fill quad from quad list */
+ template<typename PrimRefT>
+ __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ vuint<M> geomID = -1, primID = -1;
+ const PrimRefT* prim = &prims[begin];
+ vuint<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (begin<end) {
+ geomID[i] = prim->geomID();
+ primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+ const QuadMesh* mesh = scene->get<QuadMesh>(prim->geomID());
+ const QuadMesh::Quad& q = mesh->quad(prim->primID());
+ unsigned int_stride = mesh->vertices0.getStride()/4;
+ v0[i] = q.v[0] * int_stride;
+ v1[i] = q.v[1] * int_stride;
+ v2[i] = q.v[2] * int_stride;
+ v3[i] = q.v[3] * int_stride;
+#endif
+ begin++;
+ } else {
+ assert(i);
+ if (likely(i > 0)) {
+ geomID[i] = geomID[0]; // always valid geomIDs
+ primID[i] = -1; // indicates invalid data
+ v0[i] = v0[0];
+ v1[i] = v0[0];
+ v2[i] = v0[0];
+ v3[i] = v0[0];
+ }
+ }
+ if (begin<end) prim = &prims[begin];
+ }
+ new (this) QuadMi(v0,v1,v2,v3,geomID,primID); // FIXME: use non temporal store
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+ {
+ fill(prims, begin, end, scene);
+ return linearBounds(scene, itime);
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ fill(prims, begin, end, scene);
+ return linearBounds(scene, time_range);
+ }
+
+ friend embree_ostream operator<<(embree_ostream cout, const QuadMi& quad) {
+ return cout << "QuadMi<" << M << ">( "
+#if !defined(EMBREE_COMPACT_POLYS)
+ << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", "
+#endif
+ << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )";
+ }
+
+ protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+ vuint<M> v0_; // 4 byte offset of 1st vertex
+ vuint<M> v1_; // 4 byte offset of 2nd vertex
+ vuint<M> v2_; // 4 byte offset of 3rd vertex
+ vuint<M> v3_; // 4 byte offset of 4th vertex
+#endif
+ vuint<M> geomIDs; // geometry ID of mesh
+ vuint<M> primIDs; // primitive ID of primitive inside mesh
+ };
+
+ namespace isa
+ {
+
+ template<int M>
+ struct QuadMi : public embree::QuadMi<M>
+ {
+#if !defined(EMBREE_COMPACT_POLYS)
+ using embree::QuadMi<M>::v0_;
+ using embree::QuadMi<M>::v1_;
+ using embree::QuadMi<M>::v2_;
+ using embree::QuadMi<M>::v3_;
+#endif
+ using embree::QuadMi<M>::geomIDs;
+ using embree::QuadMi<M>::primIDs;
+ using embree::QuadMi<M>::geomID;
+ using embree::QuadMi<M>::primID;
+ using embree::QuadMi<M>::valid;
+
+ template<int vid>
+ __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+ {
+#if defined(EMBREE_COMPACT_POLYS)
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+ const QuadMesh::Quad& quad = mesh->quad(primID(index));
+ return (Vec3f) mesh->vertices[0][quad.v[vid]];
+#else
+ const vuint<M>& v = getVertexOffset<vid>();
+ const float* vertices = scene->vertices[geomID(index)];
+ return (Vec3f&) vertices[v[index]];
+#endif
+ }
+
+ template<int vid, typename T>
+ __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const
+ {
+#if defined(EMBREE_COMPACT_POLYS)
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+ const QuadMesh::Quad& quad = mesh->quad(primID(index));
+ const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]];
+ const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]];
+#else
+ const vuint<M>& v = getVertexOffset<vid>();
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+ const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+ const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+ const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+ const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+ const Vec3<T> p0(v0.x,v0.y,v0.z);
+ const Vec3<T> p1(v1.x,v1.y,v1.z);
+ return lerp(p0,p1,ftime);
+ }
+
+ template<int vid, int K, typename T>
+ __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const
+ {
+ Vec3<T> p0, p1;
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+ for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask))
+ {
+#if defined(EMBREE_COMPACT_POLYS)
+ const QuadMesh::Quad& quad = mesh->quad(primID(index));
+ const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]];
+ const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]];
+#else
+ const vuint<M>& v = getVertexOffset<vid>();
+ const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+ const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+ const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+ const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+ p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z;
+ p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z;
+ }
+ return (T(one)-ftime)*p0 + ftime*p1;
+ }
+
+ struct Quad {
+ vfloat4 v0,v1,v2,v3;
+ };
+
+#if defined(EMBREE_COMPACT_POLYS)
+
+ __forceinline Quad loadQuad(const int i, const Scene* const scene) const
+ {
+ const unsigned int geomID = geomIDs[i];
+ const unsigned int primID = primIDs[i];
+ if (unlikely(primID == -1)) return { zero, zero, zero, zero };
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+ const QuadMesh::Quad& quad = mesh->quad(primID);
+ const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]];
+ const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]];
+ const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]];
+ const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]];
+ return { v0, v1, v2, v3 };
+ }
+
+ __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const
+ {
+ const unsigned int geomID = geomIDs[i];
+ const unsigned int primID = primIDs[i];
+ if (unlikely(primID == -1)) return { zero, zero, zero, zero };
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+ const QuadMesh::Quad& quad = mesh->quad(primID);
+ const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]];
+ const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]];
+ const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]];
+ const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]];
+ return { v0, v1, v2, v3 };
+ }
+
+#else
+
+ __forceinline Quad loadQuad(const int i, const Scene* const scene) const
+ {
+ const float* vertices = scene->vertices[geomID(i)];
+ const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+ const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+ const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+ const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+ return { v0, v1, v2, v3 };
+ }
+
+ __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const
+ {
+ const unsigned int geomID = geomIDs[i];
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+ const float* vertices = (const float*) mesh->vertexPtr(0,itime);
+ const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+ const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+ const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+ const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+ return { v0, v1, v2, v3 };
+ }
+
+#endif
+
+ /* Gather the quads */
+ __forceinline void gather(Vec3vf<M>& p0,
+ Vec3vf<M>& p1,
+ Vec3vf<M>& p2,
+ Vec3vf<M>& p3,
+ const Scene *const scene) const;
+
+#if defined(__AVX512F__)
+ __forceinline void gather(Vec3vf16& p0,
+ Vec3vf16& p1,
+ Vec3vf16& p2,
+ Vec3vf16& p3,
+ const Scene *const scene) const;
+#endif
+
+ template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+ __noinline
+#else
+ __forceinline
+#endif
+ void gather(const vbool<K>& valid,
+ Vec3vf<K>& p0,
+ Vec3vf<K>& p1,
+ Vec3vf<K>& p2,
+ Vec3vf<K>& p3,
+ const size_t index,
+ const Scene* const scene,
+ const vfloat<K>& time) const
+ {
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+ vfloat<K> ftime;
+ const vint<K> itime = mesh->timeSegment(time, ftime);
+
+ const size_t first = bsf(movemask(valid));
+ if (likely(all(valid,itime[first] == itime)))
+ {
+ p0 = getVertex<0>(index, scene, itime[first], ftime);
+ p1 = getVertex<1>(index, scene, itime[first], ftime);
+ p2 = getVertex<2>(index, scene, itime[first], ftime);
+ p3 = getVertex<3>(index, scene, itime[first], ftime);
+ }
+ else
+ {
+ p0 = getVertex<0>(valid, index, scene, itime, ftime);
+ p1 = getVertex<1>(valid, index, scene, itime, ftime);
+ p2 = getVertex<2>(valid, index, scene, itime, ftime);
+ p3 = getVertex<3>(valid, index, scene, itime, ftime);
+ }
+ }
+
+ __forceinline void gather(Vec3vf<M>& p0,
+ Vec3vf<M>& p1,
+ Vec3vf<M>& p2,
+ Vec3vf<M>& p3,
+ const QuadMesh* mesh,
+ const Scene *const scene,
+ const int itime) const;
+
+ __forceinline void gather(Vec3vf<M>& p0,
+ Vec3vf<M>& p1,
+ Vec3vf<M>& p2,
+ Vec3vf<M>& p3,
+ const Scene *const scene,
+ const float time) const;
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(QuadMesh* mesh)
+ {
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<M; i++)
+ {
+ if (!valid(i)) break;
+ const unsigned primId = primID(i);
+ const QuadMesh::Quad& q = mesh->quad(primId);
+ const Vec3fa p0 = mesh->vertex(q.v[0]);
+ const Vec3fa p1 = mesh->vertex(q.v[1]);
+ const Vec3fa p2 = mesh->vertex(q.v[2]);
+ const Vec3fa p3 = mesh->vertex(q.v[3]);
+ bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3)));
+ }
+ return bounds;
+ }
+
+ private:
+#if !defined(EMBREE_COMPACT_POLYS)
+ template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+ };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+ template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; }
+ template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; }
+ template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; }
+ template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; }
+#endif
+
+ template<>
+ __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const Scene *const scene) const
+ {
+ prefetchL1(((char*)this)+0*64);
+ prefetchL1(((char*)this)+1*64);
+ const Quad tri0 = loadQuad(0,scene);
+ const Quad tri1 = loadQuad(1,scene);
+ const Quad tri2 = loadQuad(2,scene);
+ const Quad tri3 = loadQuad(3,scene);
+ transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+ transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+ transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+ transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z);
+ }
+
+ template<>
+ __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const QuadMesh* mesh,
+ const Scene *const scene,
+ const int itime) const
+ {
+ // FIXME: for trianglei there all geometries are identical, is this the case here too?
+
+ const Quad tri0 = loadQuad(0,itime,scene);
+ const Quad tri1 = loadQuad(1,itime,scene);
+ const Quad tri2 = loadQuad(2,itime,scene);
+ const Quad tri3 = loadQuad(3,itime,scene);
+ transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+ transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+ transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+ transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z);
+ }
+
+ template<>
+ __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const Scene *const scene,
+ const float time) const
+ {
+ const QuadMesh* mesh = scene->get<QuadMesh>(geomID(0)); // in mblur mode all geometries are identical
+
+ float ftime;
+ const int itime = mesh->timeSegment(time, ftime);
+
+ Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime);
+ Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1);
+ p0 = lerp(a0,b0,vfloat4(ftime));
+ p1 = lerp(a1,b1,vfloat4(ftime));
+ p2 = lerp(a2,b2,vfloat4(ftime));
+ p3 = lerp(a3,b3,vfloat4(ftime));
+ }
+ }
+
+ template<int M>
+ typename QuadMi<M>::Type QuadMi<M>::type;
+
+ typedef QuadMi<4> Quad4i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h
new file mode 100644
index 0000000000..96cf7f1ca2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h
@@ -0,0 +1,350 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadi.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects M quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMiIntersector1Moeller
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+ /*! Intersect a ray with the M quads and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of M quads. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+ }
+ };
+
+ /*! Intersects M triangles with K rays. */
+ template<int M, int K, bool filter>
+ struct QuadMiIntersectorKMoeller
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ Scene* scene = context->scene;
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+ const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+ const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+ const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+ pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ Scene* scene = context->scene;
+ vbool<K> valid0 = valid_i;
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+ const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+ const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+ const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+ if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+ };
+
+ /*! Intersects M quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMiIntersector1Pluecker
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+ /*! Intersect a ray with the M quads and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of M quads. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+ }
+ };
+
+ /*! Intersects M triangles with K rays. */
+ template<int M, int K, bool filter>
+ struct QuadMiIntersectorKPluecker
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ Scene* scene = context->scene;
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+ const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+ const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+ const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+ pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ Scene* scene = context->scene;
+ vbool<K> valid0 = valid_i;
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+ const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+ const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+ const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+ if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+ };
+
+ /*! Intersects M motion blur quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMiMBIntersector1Moeller
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+ /*! Intersect a ray with the M quads and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+ pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of M quads. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+ return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+ }
+ };
+
+ /*! Intersects M motion blur quads with K rays. */
+ template<int M, int K, bool filter>
+ struct QuadMiMBIntersectorKMoeller
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+ /*! Intersects K rays with M quads. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+ pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M quads. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ vbool<K> valid0 = valid_i;
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+ if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M quads and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of the M quads. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+ };
+
+ /*! Intersects M motion blur quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMiMBIntersector1Pluecker
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+ /*! Intersect a ray with the M quads and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+ pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of M quads. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+ return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+ }
+ };
+
+ /*! Intersects M motion blur quads with K rays. */
+ template<int M, int K, bool filter>
+ struct QuadMiMBIntersectorKPluecker
+ {
+ typedef QuadMi<M> Primitive;
+ typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+ /*! Intersects K rays with M quads. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+ pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M quads. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ vbool<K> valid0 = valid_i;
+ for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+ if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M quads and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of the M quads. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv.h b/thirdparty/embree-aarch64/kernels/geometry/quadv.h
new file mode 100644
index 0000000000..0a1fe4d128
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadv.h
@@ -0,0 +1,165 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ /* Stores the vertices of M quads in struct of array layout */
+ template <int M>
+ struct QuadMv
+ {
+ public:
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* Returns maximum number of stored quads */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline QuadMv() {}
+
+ /* Construction from vertices and IDs */
+ __forceinline QuadMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+ : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ /* Returns a mask that tells which quads are valid */
+ __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+ /* Returns true if the specified quad is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+ /* Returns the number of stored quads */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ __forceinline vuint<M>& geomID() { return geomIDs; }
+ __forceinline const vuint<M>& geomID() const { return geomIDs; }
+ __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M> primID() { return primIDs; }
+ __forceinline const vuint<M> primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* Calculate the bounds of the quads */
+ __forceinline BBox3fa bounds() const
+ {
+ Vec3vf<M> lower = min(v0,v1,v2,v3);
+ Vec3vf<M> upper = max(v0,v1,v2,v3);
+ vbool<M> mask = valid();
+ lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+ lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+ lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+ upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+ upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+ upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+ return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+ Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+ }
+
+ /* Non temporal store */
+ __forceinline static void store_nt(QuadMv* dst, const QuadMv& src)
+ {
+ vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+ vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+ vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+ vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+ vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+ vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+ vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+ vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+ vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+ vfloat<M>::store_nt(&dst->v3.x,src.v3.x);
+ vfloat<M>::store_nt(&dst->v3.y,src.v3.y);
+ vfloat<M>::store_nt(&dst->v3.z,src.v3.z);
+ vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+ vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+ }
+
+ /* Fill quad from quad list */
+ __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRef& prim = prims[begin];
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const QuadMesh* __restrict__ const mesh = scene->get<QuadMesh>(geomID);
+ const QuadMesh::Quad& quad = mesh->quad(primID);
+ const Vec3fa& p0 = mesh->vertex(quad.v[0]);
+ const Vec3fa& p1 = mesh->vertex(quad.v[1]);
+ const Vec3fa& p2 = mesh->vertex(quad.v[2]);
+ const Vec3fa& p3 = mesh->vertex(quad.v[3]);
+ vgeomID [i] = geomID;
+ vprimID [i] = primID;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+ }
+ QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID));
+ }
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(QuadMesh* mesh)
+ {
+ BBox3fa bounds = empty;
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (primID(i) == -1) break;
+ const unsigned geomId = geomID(i);
+ const unsigned primId = primID(i);
+ const QuadMesh::Quad& quad = mesh->quad(primId);
+ const Vec3fa p0 = mesh->vertex(quad.v[0]);
+ const Vec3fa p1 = mesh->vertex(quad.v[1]);
+ const Vec3fa p2 = mesh->vertex(quad.v[2]);
+ const Vec3fa p3 = mesh->vertex(quad.v[3]);
+ bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3)));
+ vgeomID [i] = geomId;
+ vprimID [i] = primId;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+ }
+ new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID);
+ return bounds;
+ }
+
+ public:
+ Vec3vf<M> v0; // 1st vertex of the quads
+ Vec3vf<M> v1; // 2nd vertex of the quads
+ Vec3vf<M> v2; // 3rd vertex of the quads
+ Vec3vf<M> v3; // 4rd vertex of the quads
+ private:
+ vuint<M> geomIDs; // geometry ID
+ vuint<M> primIDs; // primitive ID
+ };
+
+ template<int M>
+ typename QuadMv<M>::Type QuadMv<M>::type;
+
+ typedef QuadMv<4> Quad4v;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h
new file mode 100644
index 0000000000..30a24b291a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects M quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMvIntersector1Moeller
+ {
+ typedef QuadMv<M> Primitive;
+ typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+ /*! Intersect a ray with the M quads and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of M quads. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+ }
+ };
+
+ /*! Intersects M triangles with K rays. */
+ template<int M, int K, bool filter>
+ struct QuadMvIntersectorKMoeller
+ {
+ typedef QuadMv<M> Primitive;
+ typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+ const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+ const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+ const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+ pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+ const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+ const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+ const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+ if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+ };
+
+ /*! Intersects M quads with 1 ray */
+ template<int M, bool filter>
+ struct QuadMvIntersector1Pluecker
+ {
+ typedef QuadMv<M> Primitive;
+ typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+ /*! Intersect a ray with the M quads and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of M quads. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+ }
+ };
+
+ /*! Intersects M triangles with K rays. */
+ template<int M, int K, bool filter>
+ struct QuadMvIntersectorKPluecker
+ {
+ typedef QuadMv<M> Primitive;
+ typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+ const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+ const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+ const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+ pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+ {
+ if (!quad.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i);
+ const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+ const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+ const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+ if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+ }
+ };
+ }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h
new file mode 100644
index 0000000000..cdf68f486b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h
@@ -0,0 +1,710 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+
+/*
+
+ This file implements the intersection of a ray with a round linear
+ curve segment. We define the geometry of such a round linear curve
+ segment from point p0 with radius r0 to point p1 with radius r1
+ using the cone that touches spheres p0/r0 and p1/r1 tangentially
+ plus the sphere p1/r1. We denote the tangentially touching cone from
+ p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending
+ sphere with cone_sphere(p0,r0,p1,r1).
+
+ For multiple connected round linear curve segments this construction
+ yield a proper shape when viewed from the outside. Using the
+ following CSG we can also handle the interiour in most common cases:
+
+ round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
+ cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
+
+ Thus by subtracting the neighboring cone geometries, we cut away
+ parts of the center cone_sphere surface which lie inside the
+ combined curve. This approach works as long as geometry of the
+ current cone_sphere penetrates into direct neighbor segments only,
+ and not into segments further away.
+
+ To construct a cone that touches two spheres at p0 and p1 with r0
+ and r1, one has to increase the cone radius at r0 and r1 to obtain
+ larger radii w0 and w1, such that the infinite cone properly touches
+ the spheres. From the paper "Ray Tracing Generalized Tube
+ Primitives: Method and Applications"
+ (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications)
+ one can derive the following equations for these increased
+ radii:
+
+ sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0))
+ w0 = sr*r0
+ w1 = sr*r1
+
+ Further, we want the cone to start where it touches the sphere at p0
+ and to end where it touches sphere at p1. Therefore, we need to
+ construct clipping locations y0 and y1 for the start and end of the
+ cone. These start and end clipping location of the cone can get
+ calculated as:
+
+ Y0 = - r0 * (r1-r0) / length(p1-p0)
+ Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+
+ Where the cone starts a distance Y0 and ends a distance Y1 away of
+ point p0 along the cone center. The distance between Y1-Y0 can get
+ calculated as:
+
+ dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0)
+
+ In the code below, Y will always be scaled by length(p1-p0) to
+ obtain y and you will find the terms r0*(r1-r0) and
+ (p1-p0)^2-(r1-r0)^2.
+
+ */
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct RoundLineIntersectorHitM
+ {
+ __forceinline RoundLineIntersectorHitM() {}
+
+ __forceinline RoundLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+ : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+ __forceinline void finalize() {}
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ public:
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ namespace __roundline_internal
+ {
+ template<int M>
+ struct ConeGeometry
+ {
+ ConeGeometry (const Vec4vf<M>& a, const Vec4vf<M>& b)
+ : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {}
+
+ /*
+
+ This function tests if a point is accepted by first cone
+ clipping plane.
+
+ First, we need to project the point onto the line p0->p1:
+
+ Y = (p-p0)*(p1-p0)/length(p1-p0)
+
+ This value y is the distance to the projection point from
+ p0. The clip distances are calculated as:
+
+ Y0 = - r0 * (r1-r0) / length(p1-p0)
+ Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+
+ Thus to test if the point p is accepted by the first
+ clipping plane we need to test Y > Y0 and to test if it
+ is accepted by the second clipping plane we need to test
+ Y < Y1.
+
+ By multiplying the calculations with length(p1-p0) these
+ calculation can get simplied to:
+
+ y = (p-p0)*(p1-p0)
+ y0 = - r0 * (r1-r0)
+ y1 = (p1-p0)^2 - r1 * (r1-r0)
+
+ and the test y > y0 and y < y1.
+
+ */
+
+ __forceinline vbool<M> isClippedByPlane (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+ {
+ const Vec3vf<M> p0p = p - p0;
+ const vfloat<M> y = dot(p0p,dP);
+ const vfloat<M> cap0 = -r0dr;
+ const vbool<M> inside_cone = y > cap0;
+ return valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)) & inside_cone;
+ }
+
+ /*
+
+ This function tests whether a point lies inside the capped cone
+ tangential to its ending spheres.
+
+ Therefore one has to check if the point is inside the
+ region defined by the cone clipping planes, which is
+ performed similar as in the previous function.
+
+ To perform the inside cone test we need to project the
+ point onto the line p0->p1:
+
+ dP = p1-p0
+ Y = (p-p0)*dP/length(dP)
+
+ This value Y is the distance to the projection point from
+ p0. To obtain a parameter value u going from 0 to 1 along
+ the line p0->p1 we calculate:
+
+ U = Y/length(dP)
+
+ The radii to use at points p0 and p1 are:
+
+ w0 = sr * r0
+ w1 = sr * r1
+ dw = w1-w0
+
+ Using these radii and u one can directly test if the point
+ lies inside the cone using the formula dP*dP < wy*wy with:
+
+ wy = w0 + u*dw
+ py = p0 + u*dP - p
+
+ By multiplying the calculations with length(p1-p0) and
+ inserting the definition of w can obtain simpler equations:
+
+ y = (p-p0)*dP
+ ry = r0 + y/dP^2 * dr
+ wy = sr*ry
+ py = p0 + y/dP^2*dP - p
+ y0 = - r0 * dr
+ y1 = dP^2 - r1 * dr
+
+ Thus for the in-cone test we get:
+
+ py^2 < wy^2
+ <=> py^2 < sr^2 * ry^2
+ <=> py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2
+
+ This can further get simplified to:
+
+ (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y;
+
+ */
+
+ __forceinline vbool<M> isInsideCappedCone (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+ {
+ const Vec3vf<M> p0p = p - p0;
+ const vfloat<M> y = dot(p0p,dP);
+ const vfloat<M> cap0 = -r0dr+vfloat<M>(ulp);
+ const vfloat<M> cap1 = -r1*dr + dPdP;
+
+ vbool<M> inside_cone = valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf));
+ inside_cone &= y > cap0; // start clipping plane
+ inside_cone &= y < cap1; // end clipping plane
+ inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test
+ return inside_cone;
+ }
+
+ protected:
+ Vec3vf<M> p0;
+ Vec3vf<M> p1;
+ Vec3vf<M> dP;
+ vfloat<M> dPdP;
+ vfloat<M> r0;
+ vfloat<M> sqr_r0;
+ vfloat<M> r1;
+ vfloat<M> dr;
+ vfloat<M> drdr;
+ vfloat<M> r0dr;
+ vfloat<M> g;
+ };
+
+ template<int M>
+ struct ConeGeometryIntersector : public ConeGeometry<M>
+ {
+ using ConeGeometry<M>::p0;
+ using ConeGeometry<M>::p1;
+ using ConeGeometry<M>::dP;
+ using ConeGeometry<M>::dPdP;
+ using ConeGeometry<M>::r0;
+ using ConeGeometry<M>::sqr_r0;
+ using ConeGeometry<M>::r1;
+ using ConeGeometry<M>::dr;
+ using ConeGeometry<M>::r0dr;
+ using ConeGeometry<M>::g;
+
+ ConeGeometryIntersector (const Vec3vf<M>& ray_org, const Vec3vf<M>& ray_dir, const vfloat<M>& dOdO, const vfloat<M>& rcp_dOdO, const Vec4vf<M>& a, const Vec4vf<M>& b)
+ : ConeGeometry<M>(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir), dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)), yp(OdP + r0dr) {}
+
+ /*
+
+ This function intersects a ray with a cone that touches a
+ start sphere p0/r0 and end sphere p1/r1.
+
+ To find this ray/cone intersections one could just
+ calculate radii w0 and w1 as described above and use a
+ standard ray/cone intersection routine with these
+ radii. However, it turns out that calculations can get
+ simplified when deriving a specialized ray/cone
+ intersection for this special case. We perform
+ calculations relative to the cone origin p0 and define:
+
+ O = ray_org - p0
+ dO = ray_dir
+ dP = p1-p0
+ dr = r1-r0
+ dw = w1-w0
+
+ For some t we can compute the potential hit point h = O + t*dO and
+ project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In
+ case of an intersection, the squared distance from the hit point
+ projected onto the cone center line to the hit point should be equal
+ to the squared cone radius at u:
+
+ (u*dP - h)^2 = (w0 + u*dw)^2
+
+ Inserting the definition of h, u, w0, and dw into this formula, then
+ factoring out all terms, and sorting by t^2, t^1, and t^0 terms
+ yields a quadratic equation to solve.
+
+ Inserting u:
+ ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2
+
+ Multiplying by dP^4:
+ ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2
+
+ Inserting w0 and dw:
+ ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2)
+ ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2
+
+ Now one can insert the definition of h, factor out, and presort by t:
+ ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2
+ ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2
+
+ Factoring out further and sorting by t^2, t^1 and t^0 yields:
+
+ 0 = t^2 * [ ((dO*dP)*dP - dO-dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ]
+ + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ]
+ + t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ]
+
+ This can be simplified to:
+
+ 0 = t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ]
+ + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ]
+ + t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ]
+
+ Solving this quadratic equation yields the values for t at which the
+ ray intersects the cone.
+
+ */
+
+ __forceinline bool intersectCone(vbool<M>& valid, vfloat<M>& lower, vfloat<M>& upper)
+ {
+ /* return no hit by default */
+ lower = pos_inf;
+ upper = neg_inf;
+
+ /* compute quadratic equation A*t^2 + B*t + C = 0 */
+ const vfloat<M> OO = dot(O,O);
+ const vfloat<M> OdO = dot(dO,O);
+ const vfloat<M> A = g * dOdO - sqr(dOdP);
+ const vfloat<M> B = 2.0f * (g*OdO - dOdP*yp);
+ const vfloat<M> C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP;
+
+ /* we miss the cone if determinant is smaller than zero */
+ const vfloat<M> D = B*B - 4.0f*A*C;
+ valid &= (D >= 0.0f & g > 0.0f); // if g <= 0 then the cone is inside a sphere end
+
+ /* When rays are parallel to the cone surface, then the
+ * ray may be inside or outside the cone. We just assume a
+ * miss in that case, which is fine as rays inside the
+ * cone would anyway hit the ending spheres in that
+ * case. */
+ valid &= abs(A) > min_rcp_input;
+ if (unlikely(none(valid))) {
+ return false;
+ }
+
+ /* compute distance to front and back hit */
+ const vfloat<M> Q = sqrt(D);
+ const vfloat<M> rcp_2A = rcp(2.0f*A);
+ t_cone_front = (-B-Q)*rcp_2A;
+ y_cone_front = yp + t_cone_front*dOdP;
+ lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat<M>(pos_inf));
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+ t_cone_back = (-B+Q)*rcp_2A;
+ y_cone_back = yp + t_cone_back *dOdP;
+ upper = select( (y_cone_back > -(float)ulp) & (y_cone_back <= g) & (g > 0.0f), t_cone_back , vfloat<M>(neg_inf));
+#endif
+ return true;
+ }
+
+ /*
+ This function intersects the ray with the end sphere at
+ p1. We already clip away hits that are inside the
+ neighboring cone segment.
+
+ */
+
+ __forceinline void intersectEndSphere(vbool<M>& valid,
+ const ConeGeometry<M>& coneR,
+ vfloat<M>& lower, vfloat<M>& upper)
+ {
+ /* calculate front and back hit with end sphere */
+ const Vec3vf<M> O1 = org - p1;
+ const vfloat<M> O1dO = dot(O1,dO);
+ const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1));
+ const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+
+ /* clip away front hit if it is inside next cone segment */
+ t_sph1_front = (-O1dO - rhs1)*rcp_dOdO;
+ const Vec3vf<M> hit_front = org + t_sph1_front*dO;
+ vbool<M> valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front);
+ lower = select(valid_sph1_front, t_sph1_front, vfloat<M>(pos_inf));
+
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ /* clip away back hit if it is inside next cone segment */
+ t_sph1_back = (-O1dO + rhs1)*rcp_dOdO;
+ const Vec3vf<M> hit_back = org + t_sph1_back*dO;
+ vbool<M> valid_sph1_back = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back);
+ upper = select(valid_sph1_back, t_sph1_back, vfloat<M>(neg_inf));
+#else
+ upper = vfloat<M>(neg_inf);
+#endif
+ }
+
+ __forceinline void intersectBeginSphere(const vbool<M>& valid,
+ vfloat<M>& lower, vfloat<M>& upper)
+ {
+ /* calculate front and back hit with end sphere */
+ const Vec3vf<M> O1 = org - p0;
+ const vfloat<M> O1dO = dot(O1,dO);
+ const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0));
+ const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+
+ /* clip away front hit if it is inside next cone segment */
+ t_sph0_front = (-O1dO - rhs1)*rcp_dOdO;
+ vbool<M> valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0;
+ lower = select(valid_sph1_front, t_sph0_front, vfloat<M>(pos_inf));
+
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ /* clip away back hit if it is inside next cone segment */
+ t_sph0_back = (-O1dO + rhs1)*rcp_dOdO;
+ vbool<M> valid_sph1_back = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0;
+ upper = select(valid_sph1_back, t_sph0_back, vfloat<M>(neg_inf));
+#else
+ upper = vfloat<M>(neg_inf);
+#endif
+ }
+
+ /*
+
+ This function calculates the geometry normal of some cone hit.
+
+ For a given hit point h (relative to p0) with a cone
+ starting at p0 with radius w0 and ending at p1 with
+ radius w1 one normally calculates the geometry normal by
+ first calculating the parmetric u hit location along the
+ cone:
+
+ u = dot(h,dP)/dP^2
+
+ Using this value one can now directly calculate the
+ geometry normal by bending the connection vector (h-u*dP)
+ from hit to projected hit with some cone dependent value
+ dw/sqrt(dP^2) * normalize(dP):
+
+ Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP)
+
+ The length of the vector (h-u*dP) can also get calculated
+ by interpolating the radii as w0+u*dw which yields:
+
+ Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP
+
+ Multiplying with (w0+u*dw) yield a scaled Ng':
+
+ Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
+
+ Inserting the definition of w0 and dw and refactoring
+ yield a furhter scaled Ng'':
+
+ Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
+
+ Now inserting the definition of u gives and multiplying
+ with the denominator yields:
+
+ Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP
+
+ Factoring out, cancelling terms, dividing by dP^2, and
+ factoring again yields finally:
+
+ Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr)
+
+ */
+
+ __forceinline Vec3vf<M> Ng_cone(const vbool<M>& front_hit) const
+ {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back);
+ const vfloat<M> t = select(front_hit, t_cone_front, t_cone_back);
+ const Vec3vf<M> h = O + t*dO;
+ return g*h-dP*y;
+#else
+ const Vec3vf<M> h = O + t_cone_front*dO;
+ return g*h-dP*y_cone_front;
+#endif
+ }
+
+ /* compute geometry normal of sphere hit as the difference
+ * vector from hit point to sphere center */
+
+ __forceinline Vec3vf<M> Ng_sphere1(const vbool<M>& front_hit) const
+ {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ const vfloat<M> t_sph1 = select(front_hit, t_sph1_front, t_sph1_back);
+ return org+t_sph1*dO-p1;
+#else
+ return org+t_sph1_front*dO-p1;
+#endif
+ }
+
+ __forceinline Vec3vf<M> Ng_sphere0(const vbool<M>& front_hit) const
+ {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ const vfloat<M> t_sph0 = select(front_hit, t_sph0_front, t_sph0_back);
+ return org+t_sph0*dO-p0;
+#else
+ return org+t_sph0_front*dO-p0;
+#endif
+ }
+
+ /*
+ This function calculates the u coordinate of a
+ hit. Therefore we use the hit distance y (which is zero
+ at the first cone clipping plane) and divide by distance
+ g between the clipping planes.
+
+ */
+
+ __forceinline vfloat<M> u_cone(const vbool<M>& front_hit) const
+ {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back);
+ return clamp(y*rcp(g));
+#else
+ return clamp(y_cone_front*rcp(g));
+#endif
+ }
+
+ private:
+ Vec3vf<M> org;
+ Vec3vf<M> O;
+ Vec3vf<M> dO;
+ vfloat<M> dOdO;
+ vfloat<M> rcp_dOdO;
+ vfloat<M> OdP;
+ vfloat<M> dOdP;
+
+ /* for ray/cone intersection */
+ private:
+ vfloat<M> yp;
+ vfloat<M> y_cone_front;
+ vfloat<M> t_cone_front;
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+ vfloat<M> y_cone_back;
+ vfloat<M> t_cone_back;
+#endif
+
+ /* for ray/sphere intersection */
+ private:
+ vfloat<M> t_sph1_front;
+ vfloat<M> t_sph0_front;
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+ vfloat<M> t_sph1_back;
+ vfloat<M> t_sph0_back;
+#endif
+ };
+
+
+ template<int M, typename Epilog, typename ray_tfar_func>
+ static __forceinline bool intersectConeSphere(const vbool<M>& valid_i,
+ const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir,
+ const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+ const Vec4vf<M>& v0, const Vec4vf<M>& v1,
+ const Vec4vf<M>& vL, const Vec4vf<M>& vR,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+
+ /* move ray origin closer to make calculations numerically stable */
+ const vfloat<M> dOdO = sqr(ray_dir);
+ const vfloat<M> rcp_dOdO = rcp(dOdO);
+ const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+ const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;
+ const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;
+
+ /* intersect with cone from v0 to v1 */
+ vfloat<M> t_cone_lower, t_cone_upper;
+ ConeGeometryIntersector<M> cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1);
+ vbool<M> validCone = valid;
+ cone.intersectCone(validCone, t_cone_lower, t_cone_upper);
+
+ valid &= (validCone | (cone.g <= 0.0f)); // if cone is entirely in sphere end - check sphere
+ if (unlikely(none(valid)))
+ return false;
+
+ /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */
+ const ConeGeometry<M> coneL (v0, vL);
+ const ConeGeometry<M> coneR (v1, vR);
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+ const Vec3vf<M> hit_lower = ray_org + t_cone_lower*ray_dir;
+ const Vec3vf<M> hit_upper = ray_org + t_cone_upper*ray_dir;
+ t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat<M>(pos_inf));
+ t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat<M>(neg_inf));
+#endif
+
+ /* intersect ending sphere */
+ vfloat<M> t_sph1_lower, t_sph1_upper;
+ vfloat<M> t_sph0_lower = vfloat<M>(pos_inf);
+ vfloat<M> t_sph0_upper = vfloat<M>(neg_inf);
+ cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper);
+
+ const vbool<M> isBeginPoint = valid & (vL[0] == vfloat<M>(pos_inf));
+ if (unlikely(any(isBeginPoint))) {
+ cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper);
+ }
+
+ /* CSG union of cone and end sphere */
+ vfloat<M> t_sph_lower = min(t_sph0_lower, t_sph1_lower);
+ vfloat<M> t_cone_sphere_lower = min(t_cone_lower, t_sph_lower);
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+ vfloat<M> t_sph_upper = max(t_sph0_upper, t_sph1_upper);
+ vfloat<M> t_cone_sphere_upper = max(t_cone_upper, t_sph_upper);
+
+ /* filter out hits that are not in tnear/tfar range */
+ const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+ const vbool<M> valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat<M>(neg_inf);
+
+ /* check if there is a first hit */
+ const vbool<M> valid_first = valid_lower | valid_upper;
+ if (unlikely(none(valid_first)))
+ return false;
+
+ /* construct first hit */
+ const vfloat<M> t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper);
+ const vbool<M> cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper;
+ const vbool<M> sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper;
+ const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+ const vfloat<M> u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+ /* invoke intersection filter for first hit */
+ RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+ const bool is_hit_first = epilog(valid_first, hit);
+
+ /* check for possible second hits before potentially accepted hit */
+ const vfloat<M> t_second = t_cone_sphere_upper;
+ const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar());
+ if (unlikely(none(valid_second)))
+ return is_hit_first;
+
+ /* invoke intersection filter for second hit */
+ const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+ const vbool<M> sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper;
+ const Vec3vf<M> Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false)));
+ const vfloat<M> u_second = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat<M>(zero), vfloat<M>(one)));
+
+ hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+ const bool is_hit_second = epilog(valid_second, hit);
+
+ return is_hit_first | is_hit_second;
+#else
+ /* filter out hits that are not in tnear/tfar range */
+ const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+
+ /* check if there is a valid hit */
+ if (unlikely(none(valid_lower)))
+ return false;
+
+ /* construct first hit */
+ const vbool<M> cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper;
+ const vbool<M> sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper;
+ const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+ const vfloat<M> u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+ /* invoke intersection filter for first hit */
+ RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first);
+ const bool is_hit_first = epilog(valid_lower, hit);
+
+ return is_hit_first;
+#endif
+ }
+
+ } // end namespace __roundline_internal
+
+ template<int M>
+ struct RoundLinearCurveIntersector1
+ {
+ typedef CurvePrecalculations1 Precalculations;
+
+ struct ray_tfar {
+ Ray& ray;
+ __forceinline ray_tfar(Ray& ray) : ray(ray) {}
+ __forceinline vfloat<M> operator() () const { return ray.tfar; };
+ };
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ Ray& ray,
+ IntersectContext* context,
+ const LineSegments* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+ const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+ const Epilog& epilog)
+ {
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+ const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+ const vfloat<M> ray_tnear(ray.tnear());
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+ const Vec4vf<M> vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi);
+ const Vec4vf<M> vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi);
+ return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,vL,vR,epilog);
+ }
+ };
+
+ template<int M, int K>
+ struct RoundLinearCurveIntersectorK
+ {
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ struct ray_tfar {
+ RayK<K>& ray;
+ size_t k;
+ __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {}
+ __forceinline vfloat<M> operator() () const { return ray.tfar[k]; };
+ };
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const LineSegments* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+ const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+ const Epilog& epilog)
+ {
+ const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+ const vfloat<M> ray_tnear = ray.tnear()[k];
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+ const Vec4vf<M> vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi);
+ const Vec4vf<M> vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi);
+ return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h
new file mode 100644
index 0000000000..079817335e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h
@@ -0,0 +1,136 @@
+// ======================================================================== //
+// Copyright 2009-2020 Intel Corporation //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+// ======================================================================== //
+
+#pragma once
+
+#include "roundline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M, int Mx, bool filter>
+ struct RoundLinearCurveMiIntersector1
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct RoundLinearCurveMiMBIntersector1
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time());
+ const vbool<Mx> valid = line.template valid<Mx>();
+ RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time());
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct RoundLinearCurveMiIntersectorK
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct RoundLinearCurveMiMBIntersectorK
+ {
+ typedef LineMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+ Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]);
+ const vbool<Mx> valid = line.template valid<Mx>();
+ return RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h
new file mode 100644
index 0000000000..3ab90c29ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h
@@ -0,0 +1,183 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct SphereIntersectorHitM
+ {
+ __forceinline SphereIntersectorHitM() {}
+
+ __forceinline SphereIntersectorHitM(const vfloat<M>& t, const Vec3vf<M>& Ng)
+ : vt(t), vNg(Ng) {}
+
+ __forceinline void finalize() {}
+
+ __forceinline Vec2f uv(const size_t i) const {
+ return Vec2f(0.0f, 0.0f);
+ }
+ __forceinline float t(const size_t i) const {
+ return vt[i];
+ }
+ __forceinline Vec3fa Ng(const size_t i) const {
+ return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
+ }
+
+ public:
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct SphereIntersector1
+ {
+ typedef CurvePrecalculations1 Precalculations;
+
+ template<typename Epilog>
+ static __forceinline bool intersect(
+ const vbool<M>& valid_i, Ray& ray,
+ const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+
+ const vfloat<M> rd2 = rcp(dot(ray.dir, ray.dir));
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+ const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+ const Vec3vf<M> center = v0.xyz();
+ const vfloat<M> radius = v0.w;
+
+ const Vec3vf<M> c0 = center - ray_org;
+ const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+ const Vec3vf<M> perp = c0 - projC0 * ray_dir;
+ const vfloat<M> l2 = dot(perp, perp);
+ const vfloat<M> r2 = radius * radius;
+ valid &= (l2 <= r2);
+ if (unlikely(none(valid)))
+ return false;
+
+ const vfloat<M> td = sqrt((r2 - l2) * rd2);
+ const vfloat<M> t_front = projC0 - td;
+ const vfloat<M> t_back = projC0 + td;
+
+ const vbool<M> valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar);
+ const vbool<M> valid_back = valid & (ray.tnear() <= t_back ) & (t_back <= ray.tfar);
+
+ /* check if there is a first hit */
+ const vbool<M> valid_first = valid_front | valid_back;
+ if (unlikely(none(valid_first)))
+ return false;
+
+ /* construct first hit */
+ const vfloat<M> td_front = -td;
+ const vfloat<M> td_back = +td;
+ const vfloat<M> t_first = select(valid_front, t_front, t_back);
+ const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp;
+ SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+ /* invoke intersection filter for first hit */
+ const bool is_hit_first = epilog(valid_first, hit);
+
+ /* check for possible second hits before potentially accepted hit */
+ const vfloat<M> t_second = t_back;
+ const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar);
+ if (unlikely(none(valid_second)))
+ return is_hit_first;
+
+ /* invoke intersection filter for second hit */
+ const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
+ hit = SphereIntersectorHitM<M> (t_second, Ng_second);
+ const bool is_hit_second = epilog(valid_second, hit);
+
+ return is_hit_first | is_hit_second;
+ }
+
+ template<typename Epilog>
+ static __forceinline bool intersect(
+ const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom,
+ const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog)
+ {
+ const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ return intersect(valid_i,ray,pre,v0,epilog);
+ }
+ };
+
+ template<int M, int K>
+ struct SphereIntersectorK
+ {
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid_i,
+ RayK<K>& ray, size_t k,
+ IntersectContext* context,
+ const Points* geom,
+ const Precalculations& pre,
+ const Vec4vf<M>& v0i,
+ const Epilog& epilog)
+ {
+ vbool<M> valid = valid_i;
+
+ const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+ const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+ const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir));
+
+ const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+ const Vec3vf<M> center = v0.xyz();
+ const vfloat<M> radius = v0.w;
+
+ const Vec3vf<M> c0 = center - ray_org;
+ const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
+ const Vec3vf<M> perp = c0 - projC0 * ray_dir;
+ const vfloat<M> l2 = dot(perp, perp);
+ const vfloat<M> r2 = radius * radius;
+ valid &= (l2 <= r2);
+ if (unlikely(none(valid)))
+ return false;
+
+ const vfloat<M> td = sqrt((r2 - l2) * rd2);
+ const vfloat<M> t_front = projC0 - td;
+ const vfloat<M> t_back = projC0 + td;
+
+ const vbool<M> valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]);
+ const vbool<M> valid_back = valid & (ray.tnear()[k] <= t_back ) & (t_back <= ray.tfar[k]);
+
+ /* check if there is a first hit */
+ const vbool<M> valid_first = valid_front | valid_back;
+ if (unlikely(none(valid_first)))
+ return false;
+
+ /* construct first hit */
+ const vfloat<M> td_front = -td;
+ const vfloat<M> td_back = +td;
+ const vfloat<M> t_first = select(valid_front, t_front, t_back);
+ const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp;
+ SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+ /* invoke intersection filter for first hit */
+ const bool is_hit_first = epilog(valid_first, hit);
+
+ /* check for possible second hits before potentially accepted hit */
+ const vfloat<M> t_second = t_back;
+ const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]);
+ if (unlikely(none(valid_second)))
+ return is_hit_first;
+
+ /* invoke intersection filter for second hit */
+ const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
+ hit = SphereIntersectorHitM<M> (t_second, Ng_second);
+ const bool is_hit_second = epilog(valid_second, hit);
+
+ return is_hit_first | is_hit_second;
+ }
+ };
+ } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h
new file mode 100644
index 0000000000..1146847602
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intersector_epilog.h"
+#include "pointi.h"
+#include "sphere_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M, int Mx, bool filter>
+ struct SphereMiIntersector1
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre,
+ RayHit& ray,
+ IntersectContext* context,
+ const Primitive& sphere)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom);
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ SphereIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre,
+ Ray& ray,
+ IntersectContext* context,
+ const Primitive& sphere)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom);
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ return SphereIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query,
+ PointQueryContext* context,
+ const Primitive& sphere)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+ }
+ };
+
+ template<int M, int Mx, bool filter>
+ struct SphereMiMBIntersector1
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculations1 Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre,
+ RayHit& ray,
+ IntersectContext* context,
+ const Primitive& sphere)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom, ray.time());
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ SphereIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre,
+ Ray& ray,
+ IntersectContext* context,
+ const Primitive& sphere)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom, ray.time());
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ return SphereIntersector1<Mx>::intersect(
+ valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query,
+ PointQueryContext* context,
+ const Primitive& sphere)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct SphereMiIntersectorK
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(
+ const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom);
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ SphereIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+ }
+
+ static __forceinline bool occluded(
+ const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom);
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ return SphereIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+ }
+ };
+
+ template<int M, int Mx, int K, bool filter>
+ struct SphereMiMBIntersectorK
+ {
+ typedef PointMi<M> Primitive;
+ typedef CurvePrecalculationsK<K> Precalculations;
+
+ static __forceinline void intersect(
+ const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+ {
+ STAT3(normal.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]);
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ SphereIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+ }
+
+ static __forceinline bool occluded(
+ const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+ {
+ STAT3(shadow.trav_prims, 1, 1, 1);
+ const Points* geom = context->scene->get<Points>(sphere.geomID());
+ Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]);
+ const vbool<Mx> valid = sphere.template valid<Mx>();
+ return SphereIntersectorK<Mx, K>::intersect(
+ valid, ray, k, context, geom, pre, v0,
+ Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+ }
+ };
+ } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h
new file mode 100644
index 0000000000..94ad46ad87
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h
@@ -0,0 +1,38 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "../subdiv/subdivpatch1base.h"
+
+namespace embree
+{
+
+ struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base
+ {
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+
+ static Type type;
+
+ public:
+
+ /*! constructor for cached subdiv patch */
+ SubdivPatch1 (const unsigned int gID,
+ const unsigned int pID,
+ const unsigned int subPatch,
+ const SubdivMesh *const mesh,
+ const size_t time,
+ const Vec2f uv[4],
+ const float edge_level[4],
+ const int subdiv[4],
+ const int simd_width)
+ : SubdivPatch1Base(gID,pID,subPatch,mesh,time,uv,edge_level,subdiv,simd_width) {}
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h
new file mode 100644
index 0000000000..74ec1de258
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subdivpatch1.h"
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename T>
+ class SubdivPatch1Precalculations : public T
+ {
+ public:
+ __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr)
+ : T(ray,ptr) {}
+ };
+
+ template<int K, typename T>
+ class SubdivPatch1PrecalculationsK : public T
+ {
+ public:
+ __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray)
+ : T(valid,ray) {}
+ };
+
+ class SubdivPatch1Intersector1
+ {
+ public:
+ typedef GridSOA Primitive;
+ typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+
+ static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ lazy_node = prim->root(0);
+ pre.grid = (Primitive*)prim;
+ return false;
+ }
+
+ /*! Intersect a ray with the primitive. */
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+ else processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+ intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+ else return processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+ return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+ }
+
+ template<int N>
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node)
+ {
+ // TODO: PointQuery implement
+ assert(false && "not implemented");
+ return false;
+ }
+
+ template<int N>
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+ return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+ }
+ };
+
+ class SubdivPatch1MBIntersector1
+ {
+ public:
+ typedef SubdivPatch1 Primitive;
+ typedef GridSOAMBIntersector1::Precalculations Precalculations;
+
+ static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+ {
+ Primitive* prim = (Primitive*) prim_i;
+ GridSOA* grid = nullptr;
+ grid = (GridSOA*) prim->root_ref.get();
+ pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime);
+ lazy_node = grid->root(pre.itime);
+ pre.grid = grid;
+ return false;
+ }
+
+ /*! Intersect a ray with the primitive. */
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node);
+ else processLazyNode(pre,ray,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+ intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+ }
+
+ /*! Test if the ray is occluded by the primitive */
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node);
+ else return processLazyNode(pre,ray,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+ return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+ }
+
+ template<int N>
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node)
+ {
+ // TODO: PointQuery implement
+ assert(false && "not implemented");
+ return false;
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+ return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+ }
+ };
+
+ template <int K>
+ struct SubdivPatch1IntersectorK
+ {
+ typedef GridSOA Primitive;
+ typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+
+ static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+ {
+ lazy_node = prim->root(0);
+ pre.grid = (Primitive*)prim;
+ return false;
+ }
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+ else processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+ else return processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+ else processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+ else return processLazyNode(pre,context,prim,lazy_node);
+ }
+ };
+
+ typedef SubdivPatch1IntersectorK<4> SubdivPatch1Intersector4;
+ typedef SubdivPatch1IntersectorK<8> SubdivPatch1Intersector8;
+ typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16;
+
+ template <int K>
+ struct SubdivPatch1MBIntersectorK
+ {
+ typedef SubdivPatch1 Primitive;
+ //typedef GridSOAMBIntersectorK<K>::Precalculations Precalculations;
+ typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations;
+
+ static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node)
+ {
+ Primitive* prim = (Primitive*) prim_i;
+ GridSOA* grid = (GridSOA*) prim->root_ref.get();
+ lazy_node = grid->troot;
+ pre.grid = grid;
+ return false;
+ }
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+ else processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+ else return processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+ else processLazyNode(pre,context,prim,lazy_node);
+ }
+
+ template<int N, int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+ else return processLazyNode(pre,context,prim,lazy_node);
+ }
+ };
+
+ typedef SubdivPatch1MBIntersectorK<4> SubdivPatch1MBIntersector4;
+ typedef SubdivPatch1MBIntersectorK<8> SubdivPatch1MBIntersector8;
+ typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16;
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h
new file mode 100644
index 0000000000..39fa6fb0f0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h
@@ -0,0 +1,517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_grid_mesh.h"
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+ /* Stores M quads from an indexed face set */
+ struct SubGrid
+ {
+ /* Virtual interface to query information about the quad type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored quads */
+ static __forceinline size_t max_size() { return 1; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline SubGrid() { }
+
+ /* Construction from vertices and IDs */
+ __forceinline SubGrid(const unsigned int x,
+ const unsigned int y,
+ const unsigned int geomID,
+ const unsigned int primID)
+ : _x(x), _y(y), _geomID(geomID), _primID(primID)
+ {
+ }
+
+ __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); }
+ __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); }
+
+ /* Gather the quads */
+ __forceinline void gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const GridMesh* const mesh,
+ const GridMesh::Grid &g) const
+ {
+ /* first quad always valid */
+ const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+ const size_t vtxID01 = vtxID00 + 1;
+ const vfloat4 vtx00 = vfloat4::loadu(mesh->vertexPtr(vtxID00));
+ const vfloat4 vtx01 = vfloat4::loadu(mesh->vertexPtr(vtxID01));
+ const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+ const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+ const vfloat4 vtx10 = vfloat4::loadu(mesh->vertexPtr(vtxID10));
+ const vfloat4 vtx11 = vfloat4::loadu(mesh->vertexPtr(vtxID11));
+
+ /* deltaX => vtx02, vtx12 */
+ const size_t deltaX = invalid3x3X() ? 0 : 1;
+ const size_t vtxID02 = vtxID01 + deltaX;
+ const vfloat4 vtx02 = vfloat4::loadu(mesh->vertexPtr(vtxID02));
+ const size_t vtxID12 = vtxID11 + deltaX;
+ const vfloat4 vtx12 = vfloat4::loadu(mesh->vertexPtr(vtxID12));
+
+ /* deltaY => vtx20, vtx21 */
+ const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+ const size_t vtxID20 = vtxID10 + deltaY;
+ const size_t vtxID21 = vtxID11 + deltaY;
+ const vfloat4 vtx20 = vfloat4::loadu(mesh->vertexPtr(vtxID20));
+ const vfloat4 vtx21 = vfloat4::loadu(mesh->vertexPtr(vtxID21));
+
+ /* deltaX/deltaY => vtx22 */
+ const size_t vtxID22 = vtxID11 + deltaX + deltaY;
+ const vfloat4 vtx22 = vfloat4::loadu(mesh->vertexPtr(vtxID22));
+
+ transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z);
+ transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z);
+ transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z);
+ transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z);
+ }
+
+ template<typename T>
+ __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const
+ {
+ const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0));
+ const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1));
+ return lerp(v0,v1,ftime);
+ }
+
+ /* Gather the quads */
+ __forceinline void gatherMB(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const GridMesh* const mesh,
+ const GridMesh::Grid &g,
+ const size_t itime,
+ const float ftime) const
+ {
+ /* first quad always valid */
+ const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+ const size_t vtxID01 = vtxID00 + 1;
+ const vfloat4 vtx00 = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime);
+ const vfloat4 vtx01 = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime);
+ const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+ const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+ const vfloat4 vtx10 = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime);
+ const vfloat4 vtx11 = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime);
+
+ /* deltaX => vtx02, vtx12 */
+ const size_t deltaX = invalid3x3X() ? 0 : 1;
+ const size_t vtxID02 = vtxID01 + deltaX;
+ const vfloat4 vtx02 = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime);
+ const size_t vtxID12 = vtxID11 + deltaX;
+ const vfloat4 vtx12 = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime);
+
+ /* deltaY => vtx20, vtx21 */
+ const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+ const size_t vtxID20 = vtxID10 + deltaY;
+ const size_t vtxID21 = vtxID11 + deltaY;
+ const vfloat4 vtx20 = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime);
+ const vfloat4 vtx21 = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime);
+
+ /* deltaX/deltaY => vtx22 */
+ const size_t vtxID22 = vtxID11 + deltaX + deltaY;
+ const vfloat4 vtx22 = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime);
+
+ transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z);
+ transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z);
+ transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z);
+ transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z);
+ }
+
+
+
+ /* Gather the quads */
+ __forceinline void gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const Scene *const scene) const
+ {
+ const GridMesh* const mesh = scene->get<GridMesh>(geomID());
+ const GridMesh::Grid &g = mesh->grid(primID());
+ gather(p0,p1,p2,p3,mesh,g);
+ }
+
+ /* Gather the quads in the motion blur case */
+ __forceinline void gatherMB(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ Vec3vf4& p3,
+ const Scene *const scene,
+ const size_t itime,
+ const float ftime) const
+ {
+ const GridMesh* const mesh = scene->get<GridMesh>(geomID());
+ const GridMesh::Grid &g = mesh->grid(primID());
+ gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime);
+ }
+
+ /* Gather the quads */
+ __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const
+ {
+ const GridMesh* mesh = scene->get<GridMesh>(geomID());
+ const GridMesh::Grid &g = mesh->grid(primID());
+
+ /* first quad always valid */
+ const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+ const size_t vtxID01 = vtxID00 + 1;
+ const Vec3fa vtx00 = Vec3fa::loadu(mesh->vertexPtr(vtxID00));
+ const Vec3fa vtx01 = Vec3fa::loadu(mesh->vertexPtr(vtxID01));
+ const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+ const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+ const Vec3fa vtx10 = Vec3fa::loadu(mesh->vertexPtr(vtxID10));
+ const Vec3fa vtx11 = Vec3fa::loadu(mesh->vertexPtr(vtxID11));
+
+ /* deltaX => vtx02, vtx12 */
+ const size_t deltaX = invalid3x3X() ? 0 : 1;
+ const size_t vtxID02 = vtxID01 + deltaX;
+ const Vec3fa vtx02 = Vec3fa::loadu(mesh->vertexPtr(vtxID02));
+ const size_t vtxID12 = vtxID11 + deltaX;
+ const Vec3fa vtx12 = Vec3fa::loadu(mesh->vertexPtr(vtxID12));
+
+ /* deltaY => vtx20, vtx21 */
+ const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+ const size_t vtxID20 = vtxID10 + deltaY;
+ const size_t vtxID21 = vtxID11 + deltaY;
+ const Vec3fa vtx20 = Vec3fa::loadu(mesh->vertexPtr(vtxID20));
+ const Vec3fa vtx21 = Vec3fa::loadu(mesh->vertexPtr(vtxID21));
+
+ /* deltaX/deltaY => vtx22 */
+ const size_t vtxID22 = vtxID11 + deltaX + deltaY;
+ const Vec3fa vtx22 = Vec3fa::loadu(mesh->vertexPtr(vtxID22));
+
+ vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10;
+ vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11;
+ vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20;
+ vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21;
+ }
+
+ /* Gather the quads */
+ __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const
+ {
+ const GridMesh* mesh = scene->get<GridMesh>(geomID());
+ const GridMesh::Grid &g = mesh->grid(primID());
+
+ /* first quad always valid */
+ const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+ const size_t vtxID01 = vtxID00 + 1;
+ const vfloat4 vtx00 = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime);
+ const vfloat4 vtx01 = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime);
+ const size_t vtxID10 = vtxID00 + g.lineVtxOffset;
+ const size_t vtxID11 = vtxID01 + g.lineVtxOffset;
+ const vfloat4 vtx10 = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime);
+ const vfloat4 vtx11 = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime);
+
+ /* deltaX => vtx02, vtx12 */
+ const size_t deltaX = invalid3x3X() ? 0 : 1;
+ const size_t vtxID02 = vtxID01 + deltaX;
+ const vfloat4 vtx02 = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime);
+ const size_t vtxID12 = vtxID11 + deltaX;
+ const vfloat4 vtx12 = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime);
+
+ /* deltaY => vtx20, vtx21 */
+ const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+ const size_t vtxID20 = vtxID10 + deltaY;
+ const size_t vtxID21 = vtxID11 + deltaY;
+ const vfloat4 vtx20 = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime);
+ const vfloat4 vtx21 = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime);
+
+ /* deltaX/deltaY => vtx22 */
+ const size_t vtxID22 = vtxID11 + deltaX + deltaY;
+ const vfloat4 vtx22 = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime);
+
+ vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10;
+ vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11;
+ vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20;
+ vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21;
+ }
+
+
+ /* Calculate the bounds of the subgrid */
+ __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+ {
+ BBox3fa bounds = empty;
+ FATAL("not implemented yet");
+ return bounds;
+ }
+
+ /* Calculate the linear bounds of the primitive */
+ __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime)
+ {
+ return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+ {
+ LBBox3fa allBounds = empty;
+ FATAL("not implemented yet");
+ return allBounds;
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+ {
+ LBBox3fa allBounds = empty;
+ FATAL("not implemented yet");
+ return allBounds;
+ }
+
+
+ friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) {
+ return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )";
+ }
+
+ __forceinline unsigned int geomID() const { return _geomID; }
+ __forceinline unsigned int primID() const { return _primID; }
+ __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; }
+ __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; }
+
+ private:
+ unsigned short _x;
+ unsigned short _y;
+ unsigned int _geomID; // geometry ID of mesh
+ unsigned int _primID; // primitive ID of primitive inside mesh
+ };
+
+ struct SubGridID {
+ unsigned short x;
+ unsigned short y;
+ unsigned int primID;
+
+ __forceinline SubGridID() {}
+ __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) :
+ x(x), y(y), primID(primID) {}
+ };
+
+ /* QuantizedBaseNode as large subgrid leaf */
+ template<int N>
+ struct SubGridQBVHN
+ {
+ /* Virtual interface to query information about the quad type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ __forceinline size_t size() const
+ {
+ for (size_t i=0;i<N;i++)
+ if (primID(i) == -1) return i;
+ return N;
+ }
+
+ __forceinline void clear() {
+ for (size_t i=0;i<N;i++)
+ subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+ qnode.clear();
+ }
+
+ /* Default constructor */
+ __forceinline SubGridQBVHN() { }
+
+ /* Construction from vertices and IDs */
+ __forceinline SubGridQBVHN(const unsigned int x[N],
+ const unsigned int y[N],
+ const unsigned int primID[N],
+ const BBox3fa * const subGridBounds,
+ const unsigned int geomID,
+ const unsigned int items)
+ {
+ clear();
+ _geomID = geomID;
+
+ __aligned(64) typename BVHN<N>::AABBNode node;
+ node.clear();
+ for (size_t i=0;i<items;i++)
+ {
+ subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+ node.setBounds(i,subGridBounds[i]);
+ }
+ qnode.init_dim(node);
+ }
+
+ __forceinline unsigned int geomID() const { return _geomID; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+ __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+ __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+ __forceinline SubGrid subgrid(const size_t i) const {
+ assert(i < N);
+ assert(primID(i) != -1);
+ return SubGrid(x(i),y(i),geomID(),primID(i));
+ }
+
+ public:
+ SubGridID subgridIDs[N];
+
+ typename BVHN<N>::QuantizedBaseNode qnode;
+
+ unsigned int _geomID; // geometry ID of mesh
+
+
+ friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) {
+ cout << "SubGridQBVHN " << embree_endl;
+ for (size_t i=0;i<N;i++)
+ cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+ cout << "geomID " << sg._geomID << embree_endl;
+ cout << "lowerX " << sg.qnode.dequantizeLowerX() << embree_endl;
+ cout << "upperX " << sg.qnode.dequantizeUpperX() << embree_endl;
+ cout << "lowerY " << sg.qnode.dequantizeLowerY() << embree_endl;
+ cout << "upperY " << sg.qnode.dequantizeUpperY() << embree_endl;
+ cout << "lowerZ " << sg.qnode.dequantizeLowerZ() << embree_endl;
+ cout << "upperZ " << sg.qnode.dequantizeUpperZ() << embree_endl;
+ return cout;
+ }
+
+ };
+
+ template<int N>
+ typename SubGridQBVHN<N>::Type SubGridQBVHN<N>::type;
+
+ typedef SubGridQBVHN<4> SubGridQBVH4;
+ typedef SubGridQBVHN<8> SubGridQBVH8;
+
+
+ /* QuantizedBaseNode as large subgrid leaf */
+ template<int N>
+ struct SubGridMBQBVHN
+ {
+ /* Virtual interface to query information about the quad type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ __forceinline size_t size() const
+ {
+ for (size_t i=0;i<N;i++)
+ if (primID(i) == -1) return i;
+ return N;
+ }
+
+ __forceinline void clear() {
+ for (size_t i=0;i<N;i++)
+ subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+ qnode.clear();
+ }
+
+ /* Default constructor */
+ __forceinline SubGridMBQBVHN() { }
+
+ /* Construction from vertices and IDs */
+ __forceinline SubGridMBQBVHN(const unsigned int x[N],
+ const unsigned int y[N],
+ const unsigned int primID[N],
+ const BBox3fa * const subGridBounds0,
+ const BBox3fa * const subGridBounds1,
+ const unsigned int geomID,
+ const float toffset,
+ const float tscale,
+ const unsigned int items)
+ {
+ clear();
+ _geomID = geomID;
+ time_offset = toffset;
+ time_scale = tscale;
+
+ __aligned(64) typename BVHN<N>::AABBNode node0,node1;
+ node0.clear();
+ node1.clear();
+ for (size_t i=0;i<items;i++)
+ {
+ subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+ node0.setBounds(i,subGridBounds0[i]);
+ node1.setBounds(i,subGridBounds1[i]);
+ }
+ qnode.node0.init_dim(node0);
+ qnode.node1.init_dim(node1);
+ }
+
+ __forceinline unsigned int geomID() const { return _geomID; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+ __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+ __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+ __forceinline SubGrid subgrid(const size_t i) const {
+ assert(i < N);
+ assert(primID(i) != -1);
+ return SubGrid(x(i),y(i),geomID(),primID(i));
+ }
+
+ __forceinline float adjustTime(const float t) const { return time_scale * (t-time_offset); }
+
+ template<int K>
+ __forceinline vfloat<K> adjustTime(const vfloat<K> &t) const { return time_scale * (t-time_offset); }
+
+ public:
+ SubGridID subgridIDs[N];
+
+ typename BVHN<N>::QuantizedBaseNodeMB qnode;
+
+ float time_offset;
+ float time_scale;
+ unsigned int _geomID; // geometry ID of mesh
+
+
+ friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) {
+ cout << "SubGridMBQBVHN " << embree_endl;
+ for (size_t i=0;i<N;i++)
+ cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+ cout << "geomID " << sg._geomID << embree_endl;
+ cout << "time_offset " << sg.time_offset << embree_endl;
+ cout << "time_scale " << sg.time_scale << embree_endl;
+ cout << "lowerX " << sg.qnode.node0.dequantizeLowerX() << embree_endl;
+ cout << "upperX " << sg.qnode.node0.dequantizeUpperX() << embree_endl;
+ cout << "lowerY " << sg.qnode.node0.dequantizeLowerY() << embree_endl;
+ cout << "upperY " << sg.qnode.node0.dequantizeUpperY() << embree_endl;
+ cout << "lowerZ " << sg.qnode.node0.dequantizeLowerZ() << embree_endl;
+ cout << "upperZ " << sg.qnode.node0.dequantizeUpperZ() << embree_endl;
+ cout << "lowerX " << sg.qnode.node1.dequantizeLowerX() << embree_endl;
+ cout << "upperX " << sg.qnode.node1.dequantizeUpperX() << embree_endl;
+ cout << "lowerY " << sg.qnode.node1.dequantizeLowerY() << embree_endl;
+ cout << "upperY " << sg.qnode.node1.dequantizeUpperY() << embree_endl;
+ cout << "lowerZ " << sg.qnode.node1.dequantizeLowerZ() << embree_endl;
+ cout << "upperZ " << sg.qnode.node1.dequantizeUpperZ() << embree_endl;
+ return cout;
+ }
+
+ };
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h
new file mode 100644
index 0000000000..045eee4329
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h
@@ -0,0 +1,518 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "subgrid_intersector_moeller.h"
+#include "subgrid_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+
+ // =======================================================================================
+ // =================================== SubGridIntersectors ===============================
+ // =======================================================================================
+
+
+ template<int N, bool filter>
+ struct SubGridIntersector1Moeller
+ {
+ typedef SubGridQBVHN<N> Primitive;
+ typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+ {
+ STAT3(point_query.trav_prims,1,1,1);
+ AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+ assert(accel);
+ context->geomID = subgrid.geomID();
+ context->primID = subgrid.primID();
+ return accel->pointQuery(query, context);
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+#if defined(__AVX__)
+ STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (unlikely(dist[ID] > ray.tfar)) continue;
+ intersect(pre,ray,context,prim[i].subgrid(ID));
+ }
+ }
+ }
+ template<int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+ {
+ bool changed = false;
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<N> dist;
+ size_t mask;
+ if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+ mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+ } else {
+ mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+ }
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+ changed |= pointQuery(query, context, prim[i].subgrid(ID));
+ }
+ }
+ return changed;
+ }
+ };
+
+ template<int N, bool filter>
+ struct SubGridIntersector1Pluecker
+ {
+ typedef SubGridQBVHN<N> Primitive;
+ typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+ {
+ STAT3(point_query.trav_prims,1,1,1);
+ AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+ context->geomID = subgrid.geomID();
+ context->primID = subgrid.primID();
+ return accel->pointQuery(query, context);
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+#if defined(__AVX__)
+ STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (unlikely(dist[ID] > ray.tfar)) continue;
+ intersect(pre,ray,context,prim[i].subgrid(ID));
+ }
+ }
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+ {
+ bool changed = false;
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<N> dist;
+ size_t mask;
+ if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+ mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+ } else {
+ mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+ }
+#if defined(__AVX__)
+ STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+ changed |= pointQuery(query, context, prim[i].subgrid(ID));
+ }
+ }
+ return changed;
+ }
+ };
+
+ template<int N, int K, bool filter>
+ struct SubGridIntersectorKMoeller
+ {
+ typedef SubGridQBVHN<N> Primitive;
+ typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations;
+
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ Vec3fa vtx[16];
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ subgrid.gather(vtx,context->scene);
+ for (unsigned int i=0; i<4; i++)
+ {
+ const Vec3vf<K> p0 = vtx[i*4+0];
+ const Vec3vf<K> p1 = vtx[i*4+1];
+ const Vec3vf<K> p2 = vtx[i*4+2];
+ const Vec3vf<K> p3 = vtx[i*4+3];
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+ }
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ vbool<K> valid0 = valid_i;
+ Vec3fa vtx[16];
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ subgrid.gather(vtx,context->scene);
+ for (unsigned int i=0; i<4; i++)
+ {
+ const Vec3vf<K> p0 = vtx[i*4+0];
+ const Vec3vf<K> p1 = vtx[i*4+1];
+ const Vec3vf<K> p2 = vtx[i*4+2];
+ const Vec3vf<K> p3 = vtx[i*4+3];
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+ for (size_t j=0;j<num;j++)
+ {
+ size_t m_valid = movemask(prim[j].qnode.validMask());
+ vfloat<K> dist;
+ while(m_valid)
+ {
+ const size_t i = bscf(m_valid);
+ if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+ intersect(valid,pre,ray,context,prim[j].subgrid(i));
+ }
+ }
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+ vbool<K> valid0 = valid;
+ for (size_t j=0;j<num;j++)
+ {
+ size_t m_valid = movemask(prim[j].qnode.validMask());
+ vfloat<K> dist;
+ while(m_valid)
+ {
+ const size_t i = bscf(m_valid);
+ if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+ valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+ if (none(valid0)) break;
+ }
+ }
+ return !valid0;
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (unlikely(dist[ID] > ray.tfar[k])) continue;
+ intersect(pre,ray,k,context,prim[i].subgrid(ID));
+ }
+ }
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+ return true;
+ }
+ }
+ return false;
+ }
+ };
+
+
+ template<int N, int K, bool filter>
+ struct SubGridIntersectorKPluecker
+ {
+ typedef SubGridQBVHN<N> Primitive;
+ typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ Vec3fa vtx[16];
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ subgrid.gather(vtx,context->scene);
+ for (unsigned int i=0; i<4; i++)
+ {
+ const Vec3vf<K> p0 = vtx[i*4+0];
+ const Vec3vf<K> p1 = vtx[i*4+1];
+ const Vec3vf<K> p2 = vtx[i*4+2];
+ const Vec3vf<K> p3 = vtx[i*4+3];
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+ }
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ vbool<K> valid0 = valid_i;
+ Vec3fa vtx[16];
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ subgrid.gather(vtx,context->scene);
+ for (unsigned int i=0; i<4; i++)
+ {
+ const Vec3vf<K> p0 = vtx[i*4+0];
+ const Vec3vf<K> p1 = vtx[i*4+1];
+ const Vec3vf<K> p2 = vtx[i*4+2];
+ const Vec3vf<K> p3 = vtx[i*4+3];
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+ break;
+ }
+ return !valid0;
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+ Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+ for (size_t j=0;j<num;j++)
+ {
+ size_t m_valid = movemask(prim[j].qnode.validMask());
+ vfloat<K> dist;
+ while(m_valid)
+ {
+ const size_t i = bscf(m_valid);
+ if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+ intersect(valid,pre,ray,context,prim[j].subgrid(i));
+ }
+ }
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+ vbool<K> valid0 = valid;
+ for (size_t j=0;j<num;j++)
+ {
+ size_t m_valid = movemask(prim[j].qnode.validMask());
+ vfloat<K> dist;
+ while(m_valid)
+ {
+ const size_t i = bscf(m_valid);
+ if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+ valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+ if (none(valid0)) break;
+ }
+ }
+ return !valid0;
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (unlikely(dist[ID] > ray.tfar[k])) continue;
+ intersect(pre,ray,k,context,prim[i].subgrid(ID));
+ }
+ }
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+ if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+ return true;
+ }
+ }
+ return false;
+ }
+ };
+
+
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h
new file mode 100644
index 0000000000..f65b4abf61
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h
@@ -0,0 +1,493 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+
+namespace embree
+{
+ namespace isa
+ {
+
+ /* ----------------------------- */
+ /* -- single ray intersectors -- */
+ /* ----------------------------- */
+
+ template<int M>
+ __forceinline void interpolateUV(MoellerTrumboreHitM<M> &hit,const GridMesh::Grid &g, const SubGrid& subgrid)
+ {
+ /* correct U,V interpolation across the entire grid */
+ const vint<M> sx((int)subgrid.x());
+ const vint<M> sy((int)subgrid.y());
+ const vint<M> sxM(sx + vint<M>(0,1,1,0));
+ const vint<M> syM(sy + vint<M>(0,0,1,1));
+ const float inv_resX = rcp((float)((int)g.resX-1));
+ const float inv_resY = rcp((float)((int)g.resY-1));
+ hit.U = (hit.U + (vfloat<M>)sxM * hit.absDen) * inv_resX;
+ hit.V = (hit.V + (vfloat<M>)syM * hit.absDen) * inv_resY;
+ }
+
+ template<int M, bool filter>
+ struct SubGridQuadMIntersector1MoellerTrumbore;
+
+ template<int M, bool filter>
+ struct SubGridQuadMIntersector1MoellerTrumbore
+ {
+ __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+ __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+ __forceinline void intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+ Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+ /* intersect first triangle */
+ if (intersector.intersect(ray,v0,v1,v3,hit))
+ {
+ interpolateUV<M>(hit,g,subgrid);
+ epilog(hit.valid,hit);
+ }
+
+ /* intersect second triangle */
+ if (intersector.intersect(ray,v2,v3,v1,hit))
+ {
+ hit.U = hit.absDen - hit.U;
+ hit.V = hit.absDen - hit.V;
+ interpolateUV<M>(hit,g,subgrid);
+ epilog(hit.valid,hit);
+ }
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+ Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+ /* intersect first triangle */
+ if (intersector.intersect(ray,v0,v1,v3,hit))
+ {
+ interpolateUV<M>(hit,g,subgrid);
+ if (epilog(hit.valid,hit))
+ return true;
+ }
+
+ /* intersect second triangle */
+ if (intersector.intersect(ray,v2,v3,v1,hit))
+ {
+ hit.U = hit.absDen - hit.U;
+ hit.V = hit.absDen - hit.V;
+ interpolateUV<M>(hit,g,subgrid);
+ if (epilog(hit.valid,hit))
+ return true;
+ }
+ return false;
+ }
+ };
+
+#if defined (__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<bool filter>
+ struct SubGridQuadMIntersector1MoellerTrumbore<4,filter>
+ {
+ __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+ __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ MoellerTrumboreHitM<8> hit;
+ MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+ if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+ {
+ vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+
+#if !defined(EMBREE_BACKFACE_CULLING)
+ hit.U = select(flags,absDen-V,U);
+ hit.V = select(flags,absDen-U,V);
+ hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f));
+#else
+ hit.U = select(flags,absDen-U,U);
+ hit.V = select(flags,absDen-V,V);
+#endif
+ /* correct U,V interpolation across the entire grid */
+ const vint8 sx((int)subgrid.x());
+ const vint8 sy((int)subgrid.y());
+ const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0));
+ const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1));
+ const float inv_resX = rcp((float)((int)g.resX-1));
+ const float inv_resY = rcp((float)((int)g.resY-1));
+ hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX;
+ hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY;
+
+ if (unlikely(epilog(hit.valid,hit)))
+ return true;
+ }
+ return false;
+ }
+
+ __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+ }
+ };
+
+#endif
+
+ // ============================================================================================================================
+ // ============================================================================================================================
+ // ============================================================================================================================
+
+
+ /* ----------------------------- */
+ /* -- ray packet intersectors -- */
+ /* ----------------------------- */
+
+ template<int K>
+ struct SubGridQuadHitK
+ {
+ __forceinline SubGridQuadHitK(const vfloat<K>& U,
+ const vfloat<K>& V,
+ const vfloat<K>& T,
+ const vfloat<K>& absDen,
+ const Vec3vf<K>& Ng,
+ const vbool<K>& flags,
+ const GridMesh::Grid &g,
+ const SubGrid& subgrid,
+ const unsigned int i)
+ : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vfloat<K> rcpAbsDen = rcp(absDen);
+ const vfloat<K> t = T * rcpAbsDen;
+ const vfloat<K> u0 = min(U * rcpAbsDen,1.0f);
+ const vfloat<K> v0 = min(V * rcpAbsDen,1.0f);
+ const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+ const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+ const vfloat<K> uu = select(flags,u1,u0);
+ const vfloat<K> vv = select(flags,v1,v0);
+ const unsigned int sx = subgrid.x() + (unsigned int)(i % 2);
+ const unsigned int sy = subgrid.y() + (unsigned int)(i >>1);
+ const float inv_resX = rcp((float)(int)(g.resX-1));
+ const float inv_resY = rcp((float)(int)(g.resY-1));
+ const vfloat<K> u = (uu + (float)(int)sx) * inv_resX;
+ const vfloat<K> v = (vv + (float)(int)sy) * inv_resY;
+ const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> T;
+ const vfloat<K> absDen;
+ const vbool<K> flags;
+ const Vec3vf<K> tri_Ng;
+
+ const GridMesh::Grid &g;
+ const SubGrid& subgrid;
+ const size_t i;
+ };
+
+ template<int M, int K, bool filter>
+ struct SubGridQuadMIntersectorKMoellerTrumboreBase
+ {
+ __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_e1,
+ const Vec3vf<K>& tri_e2,
+ const Vec3vf<K>& tri_Ng,
+ const vbool<K>& flags,
+ const GridMesh::Grid &g,
+ const SubGrid &subgrid,
+ const unsigned int i,
+ const Epilog& epilog) const
+ {
+ /* calculate denominator */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> C = tri_v0 - ray.org;
+ const Vec3vf<K> R = cross(C,ray.dir);
+ const vfloat<K> den = dot(tri_Ng,ray.dir);
+ const vfloat<K> absDen = abs(den);
+ const vfloat<K> sgnDen = signmsk(den);
+
+ /* test against edge p2 p0 */
+ const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+ valid &= U >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p0 p1 */
+ const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+ valid &= V >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p1 p2 */
+ const vfloat<K> W = absDen-U-V;
+ valid &= W >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+ valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar);
+ if (unlikely(none(valid))) return false;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= den < vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#else
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* calculate hit information */
+ SubGridQuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags,g,subgrid,i);
+ return epilog(valid,hit);
+ }
+
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const vbool<K>& flags,
+ const GridMesh::Grid &g,
+ const SubGrid &subgrid,
+ const unsigned int i,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> e1 = tri_v0-tri_v1;
+ const Vec3vf<K> e2 = tri_v2-tri_v0;
+ const Vec3vf<K> Ng = cross(e2,e1);
+ return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,g,subgrid,i,epilog);
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& v0,
+ const Vec3vf<K>& v1,
+ const Vec3vf<K>& v2,
+ const Vec3vf<K>& v3,
+ const GridMesh::Grid &g,
+ const SubGrid &subgrid,
+ const unsigned int i,
+ const Epilog& epilog) const
+ {
+ intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),g,subgrid,i,epilog);
+ if (none(valid0)) return true;
+ intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),g,subgrid,i,epilog);
+ return none(valid0);
+ }
+
+ static __forceinline bool intersect1(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Vec3vf<M>& tri_Ng,
+ MoellerTrumboreHitM<M> &hit)
+ {
+ /* calculate denominator */
+ const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+ const Vec3vf<M> R = cross(C,D);
+ const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+ const vfloat<M> absDen = abs(den);
+ const vfloat<M> sgnDen = signmsk(den);
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+ const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+ vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+ valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+ if (likely(none(valid))) return false;
+
+ /* calculate hit information */
+ new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+ return true;
+ }
+
+ static __forceinline bool intersect1(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ MoellerTrumboreHitM<M> &hit)
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ const Vec3vf<M> Ng = cross(e2,e1);
+ return intersect1(ray,k,v0,e1,e2,Ng,hit);
+ }
+
+ };
+
+ template<int M, int K, bool filter>
+ struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+ {
+ __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+ : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+
+ __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+ {
+ Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+ MoellerTrumboreHitM<4> hit;
+ if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,hit))
+ {
+ interpolateUV<M>(hit,g,subgrid);
+ epilog(hit.valid,hit);
+ }
+
+ if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,hit))
+ {
+ hit.U = hit.absDen - hit.U;
+ hit.V = hit.absDen - hit.V;
+ interpolateUV<M>(hit,g,subgrid);
+ epilog(hit.valid,hit);
+ }
+
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+ {
+ Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+ MoellerTrumboreHitM<4> hit;
+ if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,hit))
+ {
+ interpolateUV<M>(hit,g,subgrid);
+ if (epilog(hit.valid,hit)) return true;
+ }
+
+ if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,hit))
+ {
+ hit.U = hit.absDen - hit.U;
+ hit.V = hit.absDen - hit.V;
+ interpolateUV<M>(hit,g,subgrid);
+ if (epilog(hit.valid,hit)) return true;
+ }
+ return false;
+ }
+ };
+
+
+#if defined (__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<int K, bool filter>
+ struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+ {
+ __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+ : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+
+ MoellerTrumboreHitM<8> hit;
+ if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit))
+ {
+ vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+#if !defined(EMBREE_BACKFACE_CULLING)
+ hit.U = select(flags,absDen-V,U);
+ hit.V = select(flags,absDen-U,V);
+ hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f));
+#else
+ hit.U = select(flags,absDen-U,U);
+ hit.V = select(flags,absDen-V,V);
+#endif
+
+ /* correct U,V interpolation across the entire grid */
+ const vint8 sx((int)subgrid.x());
+ const vint8 sy((int)subgrid.y());
+ const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0));
+ const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1));
+ const float inv_resX = rcp((float)((int)g.resX-1));
+ const float inv_resY = rcp((float)((int)g.resY-1));
+ hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX;
+ hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY;
+ if (unlikely(epilog(hit.valid,hit)))
+ return true;
+
+ }
+ return false;
+ }
+
+ __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+ {
+ return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+ }
+ };
+
+#endif
+
+
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h
new file mode 100644
index 0000000000..1cd88aa799
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h
@@ -0,0 +1,508 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+
+ template<int M>
+ struct SubGridQuadHitPlueckerM
+ {
+ __forceinline SubGridQuadHitPlueckerM() {}
+
+ __forceinline SubGridQuadHitPlueckerM(const vbool<M>& valid,
+ const vfloat<M>& U,
+ const vfloat<M>& V,
+ const vfloat<M>& UVW,
+ const vfloat<M>& t,
+ const Vec3vf<M>& Ng,
+ const vbool<M>& flags) : valid(valid), vt(t)
+ {
+ const vbool<M> invalid = abs(UVW) < min_rcp_input;
+ const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW));
+ const vfloat<M> u = min(U * rcpUVW,1.0f);
+ const vfloat<M> v = min(V * rcpUVW,1.0f);
+ const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+ const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+ vu = select(flags,u1,u);
+ vv = select(flags,v1,v);
+ vNg = Vec3vf<M>(Ng.x,Ng.y,Ng.z);
+#else
+ const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+ vv = select(flags,u1,v);
+ vu = select(flags,v1,u);
+ vNg = Vec3vf<M>(flip*Ng.x,flip*Ng.y,flip*Ng.z);
+#endif
+ }
+
+ __forceinline void finalize()
+ {
+ }
+
+ __forceinline Vec2f uv(const size_t i)
+ {
+ const float u = vu[i];
+ const float v = vv[i];
+ return Vec2f(u,v);
+ }
+
+ __forceinline float t(const size_t i) { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ public:
+ vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ __forceinline void interpolateUV(SubGridQuadHitPlueckerM<M> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY)
+ {
+ /* correct U,V interpolation across the entire grid */
+ const vint<M> sx((int)subgrid.x());
+ const vint<M> sy((int)subgrid.y());
+ const vint<M> sxM(sx + stepX);
+ const vint<M> syM(sy + stepY);
+ const float inv_resX = rcp((float)((int)g.resX-1));
+ const float inv_resY = rcp((float)((int)g.resY-1));
+ hit.vu = (hit.vu + vfloat<M>(sxM)) * inv_resX;
+ hit.vv = (hit.vv + vfloat<M>(syM)) * inv_resY;
+ }
+
+ template<int M>
+ __forceinline static bool intersectPluecker(Ray& ray,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ const vbool<M>& flags,
+ SubGridQuadHitPlueckerM<M> &hit)
+ {
+ /* calculate vertices relative to ray origin */
+ const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+ const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+ const Vec3vf<M> v0 = tri_v0-O;
+ const Vec3vf<M> v1 = tri_v1-O;
+ const Vec3vf<M> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<M> e0 = v2-v0;
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(cross(e0,v2+v0),D);
+ const vfloat<M> V = dot(cross(e1,v0+v1),D);
+ const vfloat<M> W = dot(cross(e2,v1+v2),D);
+ const vfloat<M> UVW = U+V+W;
+ const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = max(U,V,W) <= eps;
+#else
+ vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<M> den = twice(dot(Ng,D));
+
+ /* perform depth test */
+ const vfloat<M> T = twice(dot(v0,Ng));
+ const vfloat<M> t = rcp(den)*T;
+ valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar);
+ valid &= den != vfloat<M>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ new (&hit) SubGridQuadHitPlueckerM<M>(valid,U,V,UVW,t,Ng,flags);
+ return true;
+ }
+
+ template<int M, bool filter>
+ struct SubGridQuadMIntersector1Pluecker;
+
+ template<int M, bool filter>
+ struct SubGridQuadMIntersector1Pluecker
+ {
+ __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+ __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+ __forceinline void intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ SubGridQuadHitPlueckerM<M> hit;
+ Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+ /* intersect first triangle */
+ if (intersectPluecker(ray,v0,v1,v3,vbool<M>(false),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ epilog(hit.valid,hit);
+ }
+
+ /* intersect second triangle */
+ if (intersectPluecker(ray,v2,v3,v1,vbool<M>(true),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ epilog(hit.valid,hit);
+ }
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ SubGridQuadHitPlueckerM<M> hit;
+ Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID());
+
+ /* intersect first triangle */
+ if (intersectPluecker(ray,v0,v1,v3,vbool<M>(false),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ if (epilog(hit.valid,hit))
+ return true;
+ }
+
+ /* intersect second triangle */
+ if (intersectPluecker(ray,v2,v3,v1,vbool<M>(true),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ if (epilog(hit.valid,hit))
+ return true;
+ }
+
+ return false;
+ }
+ };
+
+#if defined (__AVX__)
+
+ /*! Intersects 4 quads with 1 ray using AVX */
+ template<bool filter>
+ struct SubGridQuadMIntersector1Pluecker<4,filter>
+ {
+ __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+ __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+ {
+ const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));
+#if !defined(EMBREE_BACKFACE_CULLING)
+ const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));
+#else
+ const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));
+ const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+ SubGridQuadHitPlueckerM<8> hit;
+ const vbool8 flags(0,0,0,0,1,1,1,1);
+ if (unlikely(intersectPluecker(ray,vtx0,vtx1,vtx2,flags,hit)))
+ {
+ /* correct U,V interpolation across the entire grid */
+ interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));
+ if (unlikely(epilog(hit.valid,hit)))
+ return true;
+ }
+ return false;
+ }
+
+ __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+ }
+
+ __forceinline bool occluded(Ray& ray, IntersectContext* context,
+ const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+ const GridMesh::Grid &g, const SubGrid& subgrid) const
+ {
+ return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+ }
+ };
+
+#endif
+
+
+ /* ----------------------------- */
+ /* -- ray packet intersectors -- */
+ /* ----------------------------- */
+
+ template<int K>
+ struct SubGridQuadHitPlueckerK
+ {
+ __forceinline SubGridQuadHitPlueckerK(const vfloat<K>& U,
+ const vfloat<K>& V,
+ const vfloat<K>& UVW,
+ const vfloat<K>& t,
+ const Vec3vf<K>& Ng,
+ const vbool<K>& flags,
+ const GridMesh::Grid &g,
+ const SubGrid& subgrid,
+ const unsigned int i)
+ : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vbool<K> invalid = abs(UVW) < min_rcp_input;
+ const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW));
+ const vfloat<K> u0 = min(U * rcpUVW,1.0f);
+ const vfloat<K> v0 = min(V * rcpUVW,1.0f);
+ const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+ const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+ const vfloat<K> uu = select(flags,u1,u0);
+ const vfloat<K> vv = select(flags,v1,v0);
+ const unsigned int sx = subgrid.x() + (unsigned int)(i % 2);
+ const unsigned int sy = subgrid.y() + (unsigned int)(i >>1);
+ const float inv_resX = rcp((float)(int)(g.resX-1));
+ const float inv_resY = rcp((float)(int)(g.resY-1));
+ const vfloat<K> u = (uu + (float)(int)sx) * inv_resX;
+ const vfloat<K> v = (vv + (float)(int)sy) * inv_resY;
+ const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> UVW;
+ const vfloat<K> t;
+ const vfloat<K> absDen;
+ const vbool<K> flags;
+ const Vec3vf<K> tri_Ng;
+
+ const GridMesh::Grid &g;
+ const SubGrid& subgrid;
+ const size_t i;
+ };
+
+
+ template<int M, int K, bool filter>
+ struct SubGridQuadMIntersectorKPlueckerBase
+ {
+ __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const Vec3vf<K>& tri_Ng,
+ const vbool<K>& flags,
+ const GridMesh::Grid &g,
+ const SubGrid &subgrid,
+ const unsigned int i,
+ const Epilog& epilog) const
+ {
+ /* calculate denominator */
+ /* calculate vertices relative to ray origin */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> O = ray.org;
+ const Vec3vf<K> D = ray.dir;
+ const Vec3vf<K> v0 = tri_v0-O;
+ const Vec3vf<K> v1 = tri_v1-O;
+ const Vec3vf<K> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<K> e0 = v2-v0;
+ const Vec3vf<K> e1 = v0-v1;
+ const Vec3vf<K> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+ const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+ const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+ const vfloat<K> UVW = U+V+W;
+ const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= max(U,V,W) <= eps;
+#else
+ valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+ /* perform depth test */
+ const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+ const vfloat<K> t = rcp(den)*T;
+ valid &= ray.tnear() <= t & t <= ray.tfar;
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* calculate hit information */
+ SubGridQuadHitPlueckerK<K> hit(U,V,UVW,t,tri_Ng,flags,g,subgrid,i);
+ return epilog(valid,hit);
+ }
+
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& v0,
+ const Vec3vf<K>& v1,
+ const Vec3vf<K>& v2,
+ const vbool<K>& flags,
+ const GridMesh::Grid &g,
+ const SubGrid &subgrid,
+ const unsigned int i,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> e1 = v0-v1;
+ const Vec3vf<K> e2 = v2-v0;
+ const Vec3vf<K> Ng = cross(e2,e1);
+ return intersectK(valid0,ray,v0,v1,v2,Ng,flags,g,subgrid,i,epilog);
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& v0,
+ const Vec3vf<K>& v1,
+ const Vec3vf<K>& v2,
+ const Vec3vf<K>& v3,
+ const GridMesh::Grid &g,
+ const SubGrid &subgrid,
+ const unsigned int i,
+ const Epilog& epilog) const
+ {
+ intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),g,subgrid,i,epilog);
+ if (none(valid0)) return true;
+ intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),g,subgrid,i,epilog);
+ return none(valid0);
+ }
+
+ static __forceinline bool intersect1(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ const Vec3vf<M>& tri_Ng,
+ const vbool<M>& flags,
+ SubGridQuadHitPlueckerM<M> &hit)
+ {
+ /* calculate vertices relative to ray origin */
+ const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vf<M> v0 = tri_v0-O;
+ const Vec3vf<M> v1 = tri_v1-O;
+ const Vec3vf<M> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<M> e0 = v2-v0;
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(cross(e0,v2+v0),D);
+ const vfloat<M> V = dot(cross(e1,v0+v1),D);
+ const vfloat<M> W = dot(cross(e2,v1+v2),D);
+ const vfloat<M> UVW = U+V+W;
+ const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = max(U,V,W) <= eps ;
+#else
+ vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<M> den = twice(dot(Ng,D));
+
+ /* perform depth test */
+ const vfloat<M> T = twice(dot(v0,Ng));
+ const vfloat<M> t = rcp(den)*T;
+ valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]);
+ if (unlikely(none(valid))) return false;
+
+ /* avoid division by 0 */
+ valid &= den != vfloat<M>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ new (&hit) SubGridQuadHitPlueckerM<M>(valid,U,V,UVW,t,tri_Ng,flags);
+ return true;
+ }
+
+ static __forceinline bool intersect1(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const vbool<M>& flags,
+ SubGridQuadHitPlueckerM<M> &hit)
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ const Vec3vf<M> Ng = cross(e2,e1); // FIXME: optimize!!!
+ return intersect1(ray,k,v0,v1,v2,Ng,flags,hit);
+ }
+
+ };
+
+ template<int M, int K, bool filter>
+ struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase<M,K,filter>
+ {
+ __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+ : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+
+ __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+ {
+ Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+ SubGridQuadHitPlueckerM<4> hit;
+ if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,vboolf4(false),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ epilog(hit.valid,hit);
+ }
+
+ if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,vboolf4(true),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ epilog(hit.valid,hit);
+ }
+
+ }
+
+ __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+ const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+ {
+ Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+
+ SubGridQuadHitPlueckerM<4> hit;
+ if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,vboolf4(false),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ if (epilog(hit.valid,hit)) return true;
+ }
+
+ if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,vboolf4(true),hit))
+ {
+ interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+ if (epilog(hit.valid,hit)) return true;
+ }
+ return false;
+ }
+ };
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h
new file mode 100644
index 0000000000..400a88b985
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h
@@ -0,0 +1,236 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, bool filter>
+ struct SubGridMBIntersector1Pluecker
+ {
+ typedef SubGridMBQBVHN<N> Primitive;
+ typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ float ftime;
+ const int itime = mesh->timeSegment(ray.time(), ftime);
+ Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime);
+ pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ float ftime;
+ const int itime = mesh->timeSegment(ray.time(), ftime);
+
+ Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime);
+ return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, subgrid);
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<Nx> dist;
+ const float time = prim[i].adjustTime(ray.time());
+
+ assert(time <= 1.0f);
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist);
+#if defined(__AVX__)
+ STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ if (unlikely(dist[ID] > ray.tfar)) continue;
+ intersect(pre,ray,context,prim[i].subgrid(ID));
+ }
+ }
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+ for (size_t i=0;i<num;i++)
+ {
+ const float time = prim[i].adjustTime(ray.time());
+ assert(time <= 1.0f);
+ vfloat<Nx> dist;
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+ {
+ assert(false && "not implemented");
+ return false;
+ }
+ };
+
+
+ template<int N, int K, bool filter>
+ struct SubGridMBIntersectorKPluecker
+ {
+ typedef SubGridMBQBVHN<N> Primitive;
+ typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ size_t m_valid = movemask(valid_i);
+ while(m_valid)
+ {
+ size_t ID = bscf(m_valid);
+ intersect(pre,ray,ID,context,subgrid);
+ }
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+ {
+ vbool<K> valid0 = valid_i;
+ size_t m_valid = movemask(valid_i);
+ while(m_valid)
+ {
+ size_t ID = bscf(m_valid);
+ if (occluded(pre,ray,ID,context,subgrid))
+ clear(valid0,ID);
+ }
+ return !valid0;
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ vfloat<K> ftime;
+ const vint<K> itime = mesh->timeSegment(ray.time(), ftime);
+ Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+ pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID());
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+ vfloat<K> ftime;
+ const vint<K> itime = mesh->timeSegment(ray.time(), ftime);
+ Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+ return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+ }
+
+ template<bool robust>
+ static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+ for (size_t j=0;j<num;j++)
+ {
+ size_t m_valid = movemask(prim[j].qnode.validMask());
+ const vfloat<K> time = prim[j].adjustTime(ray.time());
+
+ vfloat<K> dist;
+ while(m_valid)
+ {
+ const size_t i = bscf(m_valid);
+ if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+ intersect(valid,pre,ray,context,prim[j].subgrid(i));
+ }
+ }
+ }
+
+ template<bool robust>
+ static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+
+ vbool<K> valid0 = valid;
+ for (size_t j=0;j<num;j++)
+ {
+ size_t m_valid = movemask(prim[j].qnode.validMask());
+ const vfloat<K> time = prim[j].adjustTime(ray.time());
+ vfloat<K> dist;
+ while(m_valid)
+ {
+ const size_t i = bscf(m_valid);
+ if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+ valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+ if (none(valid0)) break;
+ }
+ }
+ return !valid0;
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<N> dist;
+ const float time = prim[i].adjustTime(ray.time()[k]);
+ assert(time <= 1.0f);
+
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ if (unlikely(dist[ID] > ray.tfar[k])) continue;
+ intersect(pre,ray,k,context,prim[i].subgrid(ID));
+ }
+ }
+ }
+
+ template<int Nx, bool robust>
+ static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+ {
+ BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+ for (size_t i=0;i<num;i++)
+ {
+ vfloat<N> dist;
+ const float time = prim[i].adjustTime(ray.time()[k]);
+ assert(time <= 1.0f);
+
+ size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist);
+ while(mask != 0)
+ {
+ const size_t ID = bscf(mask);
+ if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+ return true;
+ }
+ }
+ return false;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle.h b/thirdparty/embree-aarch64/kernels/geometry/triangle.h
new file mode 100644
index 0000000000..0dedf6dc4c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle.h
@@ -0,0 +1,162 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ /* Precalculated representation for M triangles. Stores for each
+ triangle a base vertex, two edges, and the geometry normal to
+ speed up intersection calculations */
+ template<int M>
+ struct TriangleM
+ {
+ public:
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* Returns maximum number of stored triangles */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline TriangleM() {}
+
+ /* Construction from vertices and IDs */
+ __forceinline TriangleM(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+ : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ /* Returns a mask that tells which triangles are valid */
+ __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+ /* Returns true if the specified triangle is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+ /* Returns the number of stored triangles */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ __forceinline vuint<M>& geomID() { return geomIDs; }
+ __forceinline const vuint<M>& geomID() const { return geomIDs; }
+ __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M>& primID() { return primIDs; }
+ __forceinline const vuint<M>& primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* Calculate the bounds of the triangle */
+ __forceinline BBox3fa bounds() const
+ {
+ Vec3vf<M> p0 = v0;
+ Vec3vf<M> p1 = v0-e1;
+ Vec3vf<M> p2 = v0+e2;
+ Vec3vf<M> lower = min(p0,p1,p2);
+ Vec3vf<M> upper = max(p0,p1,p2);
+ vbool<M> mask = valid();
+ lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+ lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+ lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+ upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+ upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+ upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+ return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+ Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+ }
+
+ /* Non temporal store */
+ __forceinline static void store_nt(TriangleM* dst, const TriangleM& src)
+ {
+ vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+ vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+ vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+ vfloat<M>::store_nt(&dst->e1.x,src.e1.x);
+ vfloat<M>::store_nt(&dst->e1.y,src.e1.y);
+ vfloat<M>::store_nt(&dst->e1.z,src.e1.z);
+ vfloat<M>::store_nt(&dst->e2.x,src.e2.x);
+ vfloat<M>::store_nt(&dst->e2.y,src.e2.y);
+ vfloat<M>::store_nt(&dst->e2.z,src.e2.z);
+ vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+ vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+ }
+
+ /* Fill triangle from triangle list */
+ __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRef& prim = prims[begin];
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+ vgeomID [i] = geomID;
+ vprimID [i] = primID;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ }
+ TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+ }
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(TriangleMesh* mesh)
+ {
+ BBox3fa bounds = empty;
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (unlikely(geomID(i) == -1)) break;
+ const unsigned geomId = geomID(i);
+ const unsigned primId = primID(i);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+ const Vec3fa p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa p2 = mesh->vertex(tri.v[2]);
+ bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+ vgeomID [i] = geomId;
+ vprimID [i] = primId;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ }
+ TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+ return bounds;
+ }
+
+ public:
+ Vec3vf<M> v0; // base vertex of the triangles
+ Vec3vf<M> e1; // 1st edge of the triangles (v0-v1)
+ Vec3vf<M> e2; // 2nd edge of the triangles (v2-v0)
+ private:
+ vuint<M> geomIDs; // geometry IDs
+ vuint<M> primIDs; // primitive IDs
+ };
+
+ template<int M>
+ typename TriangleM<M>::Type TriangleM<M>::type;
+
+ typedef TriangleM<4> Triangle4;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h
new file mode 100644
index 0000000000..125a42c5fe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects M triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMIntersector1Moeller
+ {
+ typedef TriangleM<M> Primitive;
+ typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with the M triangles and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+
+ };
+
+ /*! Intersects M triangles with K rays. */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMIntersectorKMoeller
+ {
+ typedef TriangleM<M> Primitive;
+ typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+ {
+ STAT_USER(0,TriangleM<M>::max_size());
+ for (size_t i=0; i<TriangleM<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i);
+ const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i);
+ const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i);
+ pre.intersectEdgeK(valid_i,ray,p0,e1,e2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<TriangleM<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i);
+ const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i);
+ const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i);
+ pre.intersectEdgeK(valid0,ray,p0,e1,e2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h
new file mode 100644
index 0000000000..b5a8519236
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h
@@ -0,0 +1,403 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of the Moeller
+ * Trumbore intersector from the paper "Fast, Minimum Storage
+ * Ray-Triangle Intersection". In contrast to the paper we
+ * precalculate some factors and factor the calculations differently
+ * to allow precalculating the cross product e1 x e2. The resulting
+ * algorithm is similar to the fastest one of the paper "Optimizing
+ * Ray-Triangle Intersection via Automated Search". */
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct MoellerTrumboreHitM
+ {
+ __forceinline MoellerTrumboreHitM() {}
+
+ __forceinline MoellerTrumboreHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const vfloat<M>& absDen, const Vec3vf<M>& Ng)
+ : U(U), V(V), T(T), absDen(absDen), valid(valid), vNg(Ng) {}
+
+ __forceinline void finalize()
+ {
+ const vfloat<M> rcpAbsDen = rcp(absDen);
+ vt = T * rcpAbsDen;
+ vu = U * rcpAbsDen;
+ vv = V * rcpAbsDen;
+ }
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ public:
+ vfloat<M> U;
+ vfloat<M> V;
+ vfloat<M> T;
+ vfloat<M> absDen;
+
+ public:
+ vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct MoellerTrumboreIntersector1
+ {
+ __forceinline MoellerTrumboreIntersector1() {}
+
+ __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {}
+
+ __forceinline bool intersect(const vbool<M>& valid0,
+ Ray& ray,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Vec3vf<M>& tri_Ng,
+ MoellerTrumboreHitM<M>& hit) const
+ {
+ /* calculate denominator */
+ vbool<M> valid = valid0;
+ const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+ const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+ const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+ const Vec3vf<M> R = cross(C,D);
+ const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+
+ const vfloat<M> absDen = abs(den);
+ const vfloat<M> sgnDen = signmsk(den);
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+ const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+ valid &= (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+ valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar));
+ if (likely(none(valid))) return false;
+
+
+ /* update hit information */
+ new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+
+ return true;
+ }
+
+ __forceinline bool intersectEdge(Ray& ray,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ MoellerTrumboreHitM<M>& hit) const
+ {
+ vbool<M> valid = true;
+ const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1);
+ return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,hit);
+ }
+
+ __forceinline bool intersect(Ray& ray,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ MoellerTrumboreHitM<M>& hit) const
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ return intersectEdge(ray,v0,e1,e2,hit);
+ }
+
+ __forceinline bool intersect(const vbool<M>& valid,
+ Ray& ray,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ MoellerTrumboreHitM<M>& hit) const
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ return intersectEdge(valid,ray,v0,e1,e2,hit);
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectEdge(Ray& ray,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& e1,
+ const Vec3vf<M>& e2,
+ const Epilog& epilog) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ if (likely(intersectEdge(ray,v0,e1,e2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(Ray& ray,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ if (likely(intersect(ray,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(const vbool<M>& valid,
+ Ray& ray,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ if (likely(intersect(valid,ray,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+ };
+
+ template<int K>
+ struct MoellerTrumboreHitK
+ {
+ __forceinline MoellerTrumboreHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+ : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vfloat<K> rcpAbsDen = rcp(absDen);
+ const vfloat<K> t = T * rcpAbsDen;
+ const vfloat<K> u = U * rcpAbsDen;
+ const vfloat<K> v = V * rcpAbsDen;
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> T;
+ const vfloat<K> absDen;
+ const Vec3vf<K> Ng;
+ };
+
+ template<int M, int K>
+ struct MoellerTrumboreIntersectorK
+ {
+ __forceinline MoellerTrumboreIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ //RayK<K>& ray,
+ const Vec3vf<K>& ray_org,
+ const Vec3vf<K>& ray_dir,
+ const vfloat<K>& ray_tnear,
+ const vfloat<K>& ray_tfar,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_e1,
+ const Vec3vf<K>& tri_e2,
+ const Vec3vf<K>& tri_Ng,
+ const Epilog& epilog) const
+ {
+ /* calculate denominator */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> C = tri_v0 - ray_org;
+ const Vec3vf<K> R = cross(C,ray_dir);
+ const vfloat<K> den = dot(tri_Ng,ray_dir);
+ const vfloat<K> absDen = abs(den);
+ const vfloat<K> sgnDen = signmsk(den);
+
+ /* test against edge p2 p0 */
+ const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+ valid &= U >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p0 p1 */
+ const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+ valid &= V >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p1 p2 */
+ const vfloat<K> W = absDen-U-V;
+ valid &= W >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+ valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+ if (unlikely(none(valid))) return false;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= den < vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#else
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* calculate hit information */
+ MoellerTrumboreHitK<K> hit(U,V,T,absDen,tri_Ng);
+ return epilog(valid,hit);
+ }
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> e1 = tri_v0-tri_v1;
+ const Vec3vf<K> e2 = tri_v2-tri_v0;
+ const Vec3vf<K> Ng = cross(e2,e1);
+ return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog);
+ }
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_e1,
+ const Vec3vf<K>& tri_e2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1);
+ return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog);
+ }
+
+ /*! Intersect k'th ray from ray packet of size K with M triangles. */
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ MoellerTrumboreHitM<M>& hit) const
+ {
+ /* calculate denominator */
+ typedef Vec3vf<M> Vec3vfM;
+ const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+ const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vfM C = Vec3vfM(tri_v0) - O;
+ const Vec3vfM R = cross(C,D);
+ const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+ const vfloat<M> absDen = abs(den);
+ const vfloat<M> sgnDen = signmsk(den);
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+ const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+ vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+ valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+ if (likely(none(valid))) return false;
+
+ /* calculate hit information */
+ new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+ return true;
+ }
+
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const BBox<vfloat<M>>& time_range,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ MoellerTrumboreHitM<M>& hit) const
+ {
+ if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit)))
+ {
+ hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]);
+ hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+ return any(hit.valid);
+ }
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Epilog& epilog) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const BBox<vfloat<M>>& time_range,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Epilog& epilog) const
+ {
+ MoellerTrumboreHitM<M> hit;
+ if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ return intersectEdge(ray,k,v0,e1,e2,epilog);
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(RayK<K>& ray,
+ size_t k,
+ const BBox<vfloat<M>>& time_range,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ return intersectEdge(ray,k,time_range,v0,e1,e2,epilog);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h
new file mode 100644
index 0000000000..f1de99d208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "intersector_epilog.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ * the ray origin into the origin of the coordinate system and then
+ * uses Pluecker coordinates for the intersection. Due to the shift,
+ * the Pluecker coordinate calculation simplifies and the tests get
+ * numerically stable. The edge equations are watertight along the
+ * edge for neighboring triangles. */
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M, typename UVMapper>
+ struct PlueckerHitM
+ {
+ __forceinline PlueckerHitM(const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW, const vfloat<M>& t, const Vec3vf<M>& Ng, const UVMapper& mapUV)
+ : U(U), V(V), UVW(UVW), mapUV(mapUV), vt(t), vNg(Ng) {}
+
+ __forceinline void finalize()
+ {
+ const vbool<M> invalid = abs(UVW) < min_rcp_input;
+ const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW));
+ vu = U * rcpUVW;
+ vv = V * rcpUVW;
+ mapUV(vu,vv);
+ }
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ private:
+ const vfloat<M> U;
+ const vfloat<M> V;
+ const vfloat<M> UVW;
+ const UVMapper& mapUV;
+
+ public:
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct PlueckerIntersector1
+ {
+ __forceinline PlueckerIntersector1() {}
+
+ __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {}
+
+ template<typename UVMapper, typename Epilog>
+ __forceinline bool intersect(Ray& ray,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ const UVMapper& mapUV,
+ const Epilog& epilog) const
+ {
+ /* calculate vertices relative to ray origin */
+ const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+ const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+ const Vec3vf<M> v0 = tri_v0-O;
+ const Vec3vf<M> v1 = tri_v1-O;
+ const Vec3vf<M> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<M> e0 = v2-v0;
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(cross(e0,v2+v0),D);
+ const vfloat<M> V = dot(cross(e1,v0+v1),D);
+ const vfloat<M> W = dot(cross(e2,v1+v2),D);
+ const vfloat<M> UVW = U+V+W;
+ const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = max(U,V,W) <= eps;
+#else
+ vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<M> den = twice(dot(Ng,D));
+
+ /* perform depth test */
+ const vfloat<M> T = twice(dot(v0,Ng));
+ const vfloat<M> t = rcp(den)*T;
+ valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar);
+ valid &= den != vfloat<M>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ PlueckerHitM<M,UVMapper> hit(U,V,UVW,t,Ng,mapUV);
+ return epilog(valid,hit);
+ }
+ };
+
+ template<int K, typename UVMapper>
+ struct PlueckerHitK
+ {
+ __forceinline PlueckerHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW, const vfloat<K>& t, const Vec3vf<K>& Ng, const UVMapper& mapUV)
+ : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vbool<K> invalid = abs(UVW) < min_rcp_input;
+ const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW));
+ vfloat<K> u = U * rcpUVW;
+ vfloat<K> v = V * rcpUVW;
+ mapUV(u,v);
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> UVW;
+ const vfloat<K> t;
+ const Vec3vf<K> Ng;
+ const UVMapper& mapUV;
+ };
+
+ template<int M, int K>
+ struct PlueckerIntersectorK
+ {
+ __forceinline PlueckerIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename UVMapper, typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const UVMapper& mapUV,
+ const Epilog& epilog) const
+ {
+ /* calculate vertices relative to ray origin */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> O = ray.org;
+ const Vec3vf<K> D = ray.dir;
+ const Vec3vf<K> v0 = tri_v0-O;
+ const Vec3vf<K> v1 = tri_v1-O;
+ const Vec3vf<K> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<K> e0 = v2-v0;
+ const Vec3vf<K> e1 = v0-v1;
+ const Vec3vf<K> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+ const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+ const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+ const vfloat<K> UVW = U+V+W;
+ const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= max(U,V,W) <= eps;
+#else
+ valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+ /* perform depth test */
+ const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+ const vfloat<K> t = rcp(den)*T;
+ valid &= ray.tnear() <= t & t <= ray.tfar;
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* calculate hit information */
+ PlueckerHitK<K,UVMapper> hit(U,V,UVW,t,Ng,mapUV);
+ return epilog(valid,hit);
+ }
+
+ /*! Intersect k'th ray from ray packet of size K with M triangles. */
+ template<typename UVMapper, typename Epilog>
+ __forceinline bool intersect(RayK<K>& ray, size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ const UVMapper& mapUV,
+ const Epilog& epilog) const
+ {
+ /* calculate vertices relative to ray origin */
+ const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vf<M> v0 = tri_v0-O;
+ const Vec3vf<M> v1 = tri_v1-O;
+ const Vec3vf<M> v2 = tri_v2-O;
+
+ /* calculate triangle edges */
+ const Vec3vf<M> e0 = v2-v0;
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v1-v2;
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(cross(e0,v2+v0),D);
+ const vfloat<M> V = dot(cross(e1,v0+v1),D);
+ const vfloat<M> W = dot(cross(e2,v1+v2),D);
+ const vfloat<M> UVW = U+V+W;
+ const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = max(U,V,W) <= eps;
+#else
+ vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+ if (unlikely(none(valid))) return false;
+
+ /* calculate geometry normal and denominator */
+ const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+ const vfloat<M> den = twice(dot(Ng,D));
+
+ /* perform depth test */
+ const vfloat<M> T = twice(dot(v0,Ng));
+ const vfloat<M> t = rcp(den)*T;
+ valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]);
+ if (unlikely(none(valid))) return false;
+
+ /* avoid division by 0 */
+ valid &= den != vfloat<M>(zero);
+ if (unlikely(none(valid))) return false;
+
+ /* update hit information */
+ PlueckerHitM<M,UVMapper> hit(U,V,UVW,t,Ng,mapUV);
+ return epilog(valid,hit);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h
new file mode 100644
index 0000000000..63e649d8fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h
@@ -0,0 +1,418 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of the Woop's ray-triangle intersection test */
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int M>
+ struct WoopHitM
+ {
+ __forceinline WoopHitM() {}
+
+ __forceinline WoopHitM(const vbool<M>& valid,
+ const vfloat<M>& U,
+ const vfloat<M>& V,
+ const vfloat<M>& T,
+ const vfloat<M>& inv_det,
+ const Vec3vf<M>& Ng)
+ : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {}
+
+ __forceinline void finalize()
+ {
+ vt = T;
+ vu = U*inv_det;
+ vv = V*inv_det;
+ }
+
+ __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+ __forceinline float t (const size_t i) const { return vt[i]; }
+ __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+ private:
+ const vfloat<M> U;
+ const vfloat<M> V;
+ const vfloat<M> T;
+ const vfloat<M> inv_det;
+
+ public:
+ const vbool<M> valid;
+ vfloat<M> vu;
+ vfloat<M> vv;
+ vfloat<M> vt;
+ Vec3vf<M> vNg;
+ };
+
+ template<int M>
+ struct WoopPrecalculations1
+ {
+ unsigned int kx,ky,kz;
+ Vec3vf<M> org;
+ Vec3fa S;
+ __forceinline WoopPrecalculations1() {}
+
+ __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr)
+ {
+ kz = maxDim(abs(ray.dir));
+ kx = (kz+1) % 3;
+ ky = (kx+1) % 3;
+ const float inv_dir_kz = rcp(ray.dir[kz]);
+ if (ray.dir[kz]) std::swap(kx,ky);
+ S.x = ray.dir[kx] * inv_dir_kz;
+ S.y = ray.dir[ky] * inv_dir_kz;
+ S.z = inv_dir_kz;
+ org = Vec3vf<M>(ray.org[kx],ray.org[ky],ray.org[kz]);
+ }
+ };
+
+
+ template<int M>
+ struct WoopIntersector1
+ {
+
+ typedef WoopPrecalculations1<M> Precalculations;
+
+ __forceinline WoopIntersector1() {}
+
+ __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {}
+
+ static __forceinline bool intersect(const vbool<M>& valid0,
+ Ray& ray,
+ const Precalculations& pre,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_v1,
+ const Vec3vf<M>& tri_v2,
+ WoopHitM<M>& hit)
+ {
+ vbool<M> valid = valid0;
+
+ /* vertices relative to ray origin */
+ const Vec3vf<M> org = Vec3vf<M>(pre.org.x,pre.org.y,pre.org.z);
+ const Vec3vf<M> A = Vec3vf<M>(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org;
+ const Vec3vf<M> B = Vec3vf<M>(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org;
+ const Vec3vf<M> C = Vec3vf<M>(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org;
+
+ /* shear and scale vertices */
+ const vfloat<M> Ax = nmadd(A.z,pre.S.x,A.x);
+ const vfloat<M> Ay = nmadd(A.z,pre.S.y,A.y);
+ const vfloat<M> Bx = nmadd(B.z,pre.S.x,B.x);
+ const vfloat<M> By = nmadd(B.z,pre.S.y,B.y);
+ const vfloat<M> Cx = nmadd(C.z,pre.S.x,C.x);
+ const vfloat<M> Cy = nmadd(C.z,pre.S.y,C.y);
+
+ /* scaled barycentric */
+ const vfloat<M> U0 = Cx*By;
+ const vfloat<M> U1 = Cy*Bx;
+ const vfloat<M> V0 = Ax*Cy;
+ const vfloat<M> V1 = Ay*Cx;
+ const vfloat<M> W0 = Bx*Ay;
+ const vfloat<M> W1 = By*Ax;
+#if !defined(__AVX512F__)
+ valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) |
+ (U0 <= U1) & (V0 <= V1) & (W0 <= W1);
+#else
+ valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1);
+#endif
+
+ if (likely(none(valid))) return false;
+ const vfloat<M> U = U0-U1;
+ const vfloat<M> V = V0-V1;
+ const vfloat<M> W = W0-W1;
+
+ const vfloat<M> det = U+V+W;
+
+ valid &= det != 0.0f;
+ const vfloat<M> inv_det = rcp(det);
+
+ const vfloat<M> Az = pre.S.z * A.z;
+ const vfloat<M> Bz = pre.S.z * B.z;
+ const vfloat<M> Cz = pre.S.z * C.z;
+ const vfloat<M> T = madd(U,Az,madd(V,Bz,W*Cz));
+ const vfloat<M> t = T * inv_det;
+ /* perform depth test */
+ valid &= (vfloat<M>(ray.tnear()) < t) & (t <= vfloat<M>(ray.tfar));
+ if (likely(none(valid))) return false;
+
+ const Vec3vf<M> tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1);
+
+ /* update hit information */
+ new (&hit) WoopHitM<M>(valid,U,V,t,inv_det,tri_Ng);
+ return true;
+ }
+
+ static __forceinline bool intersect(Ray& ray,
+ const Precalculations& pre,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ WoopHitM<M>& hit)
+ {
+ vbool<M> valid = true;
+ return intersect(valid,ray,pre,v0,v1,v2,hit);
+ }
+
+
+ template<typename Epilog>
+ static __forceinline bool intersect(Ray& ray,
+ const Precalculations& pre,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog)
+ {
+ WoopHitM<M> hit;
+ if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ static __forceinline bool intersect(const vbool<M>& valid,
+ Ray& ray,
+ const Precalculations& pre,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog)
+ {
+ WoopHitM<M> hit;
+ if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+ };
+
+#if 0
+ template<int K>
+ struct WoopHitK
+ {
+ __forceinline WoopHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+ : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+
+ __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+ {
+ const vfloat<K> rcpAbsDen = rcp(absDen);
+ const vfloat<K> t = T * rcpAbsDen;
+ const vfloat<K> u = U * rcpAbsDen;
+ const vfloat<K> v = V * rcpAbsDen;
+ return std::make_tuple(u,v,t,Ng);
+ }
+
+ private:
+ const vfloat<K> U;
+ const vfloat<K> V;
+ const vfloat<K> T;
+ const vfloat<K> absDen;
+ const Vec3vf<K> Ng;
+ };
+
+ template<int M, int K>
+ struct WoopIntersectorK
+ {
+ __forceinline WoopIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {}
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ //RayK<K>& ray,
+ const Vec3vf<K>& ray_org,
+ const Vec3vf<K>& ray_dir,
+ const vfloat<K>& ray_tnear,
+ const vfloat<K>& ray_tfar,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_e1,
+ const Vec3vf<K>& tri_e2,
+ const Vec3vf<K>& tri_Ng,
+ const Epilog& epilog) const
+ {
+ /* calculate denominator */
+ vbool<K> valid = valid0;
+ const Vec3vf<K> C = tri_v0 - ray_org;
+ const Vec3vf<K> R = cross(C,ray_dir);
+ const vfloat<K> den = dot(tri_Ng,ray_dir);
+ const vfloat<K> absDen = abs(den);
+ const vfloat<K> sgnDen = signmsk(den);
+
+ /* test against edge p2 p0 */
+ const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+ valid &= U >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p0 p1 */
+ const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+ valid &= V >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* test against edge p1 p2 */
+ const vfloat<K> W = absDen-U-V;
+ valid &= W >= 0.0f;
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+ valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+ if (unlikely(none(valid))) return false;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ valid &= den < vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#else
+ valid &= den != vfloat<K>(zero);
+ if (unlikely(none(valid))) return false;
+#endif
+
+ /* calculate hit information */
+ WoopHitK<K> hit(U,V,T,absDen,tri_Ng);
+ return epilog(valid,hit);
+ }
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_v1,
+ const Vec3vf<K>& tri_v2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> e1 = tri_v0-tri_v1;
+ const Vec3vf<K> e2 = tri_v2-tri_v0;
+ const Vec3vf<K> Ng = cross(e2,e1);
+ return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog);
+ }
+
+ /*! Intersects K rays with one of M triangles. */
+ template<typename Epilog>
+ __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0,
+ RayK<K>& ray,
+ const Vec3vf<K>& tri_v0,
+ const Vec3vf<K>& tri_e1,
+ const Vec3vf<K>& tri_e2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1);
+ return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog);
+ }
+
+ /*! Intersect k'th ray from ray packet of size K with M triangles. */
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ WoopHitM<M>& hit) const
+ {
+ /* calculate denominator */
+ typedef Vec3vf<M> Vec3vfM;
+ const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+ const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+ const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+ const Vec3vfM C = Vec3vfM(tri_v0) - O;
+ const Vec3vfM R = cross(C,D);
+ const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+ const vfloat<M> absDen = abs(den);
+ const vfloat<M> sgnDen = signmsk(den);
+
+ /* perform edge tests */
+ const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+ const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+
+ /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+ vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+ vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+ if (likely(none(valid))) return false;
+
+ /* perform depth test */
+ const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+ valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+ if (likely(none(valid))) return false;
+
+ /* calculate hit information */
+ new (&hit) WoopHitM<M>(valid,U,V,T,absDen,tri_Ng);
+ return true;
+ }
+
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const BBox<vfloat<M>>& time_range,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ WoopHitM<M>& hit) const
+ {
+ if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit)))
+ {
+ hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]);
+ hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+ return any(hit.valid);
+ }
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Epilog& epilog) const
+ {
+ WoopHitM<M> hit;
+ if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersectEdge(RayK<K>& ray,
+ size_t k,
+ const BBox<vfloat<M>>& time_range,
+ const Vec3vf<M>& tri_v0,
+ const Vec3vf<M>& tri_e1,
+ const Vec3vf<M>& tri_e2,
+ const Epilog& epilog) const
+ {
+ WoopHitM<M> hit;
+ if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+ return false;
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(RayK<K>& ray,
+ size_t k,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ return intersectEdge(ray,k,v0,e1,e2,epilog);
+ }
+
+ template<typename Epilog>
+ __forceinline bool intersect(RayK<K>& ray,
+ size_t k,
+ const BBox<vfloat<M>>& time_range,
+ const Vec3vf<M>& v0,
+ const Vec3vf<M>& v1,
+ const Vec3vf<M>& v2,
+ const Epilog& epilog) const
+ {
+ const Vec3vf<M> e1 = v0-v1;
+ const Vec3vf<M> e2 = v2-v0;
+ return intersectEdge(ray,k,time_range,v0,e1,e2,epilog);
+ }
+ };
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h
new file mode 100644
index 0000000000..91b35c36f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h
@@ -0,0 +1,132 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct TriangleTriangleIntersector
+ {
+ __forceinline static float T(float pa0, float pa1, float da0, float da1) {
+ return pa0 + (pa1-pa0)*da0/(da0-da1);
+ }
+
+ __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) {
+ return det(p-a0,a0-a1) >= 0.0f;
+ }
+
+ __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c)
+ {
+ const bool pab = point_line_side(p,a,b);
+ const bool pbc = point_line_side(p,b,c);
+ const bool pca = point_line_side(p,c,a);
+ return pab == pbc && pab == pca;
+ }
+
+ __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1)
+ {
+ const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1);
+ const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1);
+ return different_sides0 && different_sides1;
+ }
+
+ __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2,
+ const Vec2f& b0, const Vec2f& b1, const Vec2f& b2)
+ {
+ const bool a01_b01 = intersect_line_line(a0,a1,b0,b1);
+ if (a01_b01) return true;
+ const bool a01_b12 = intersect_line_line(a0,a1,b1,b2);
+ if (a01_b12) return true;
+ const bool a01_b20 = intersect_line_line(a0,a1,b2,b0);
+ if (a01_b20) return true;
+ const bool a12_b01 = intersect_line_line(a1,a2,b0,b1);
+ if (a12_b01) return true;
+ const bool a12_b12 = intersect_line_line(a1,a2,b1,b2);
+ if (a12_b12) return true;
+ const bool a12_b20 = intersect_line_line(a1,a2,b2,b0);
+ if (a12_b20) return true;
+ const bool a20_b01 = intersect_line_line(a2,a0,b0,b1);
+ if (a20_b01) return true;
+ const bool a20_b12 = intersect_line_line(a2,a0,b1,b2);
+ if (a20_b12) return true;
+ const bool a20_b20 = intersect_line_line(a2,a0,b2,b0);
+ if (a20_b20) return true;
+
+ bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2);
+ if (a_in_b) return true;
+
+ bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2);
+ if (b_in_a) return true;
+
+ return false;
+ }
+
+ static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2,
+ const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2)
+ {
+ const float eps = 1E-5f;
+
+ /* calculate triangle planes */
+ const Vec3fa Na = cross(a1-a0,a2-a0);
+ const float Ca = dot(Na,a0);
+ const Vec3fa Nb = cross(b1-b0,b2-b0);
+ const float Cb = dot(Nb,b0);
+
+ /* project triangle A onto plane B */
+ const float da0 = dot(Nb,a0)-Cb;
+ const float da1 = dot(Nb,a1)-Cb;
+ const float da2 = dot(Nb,a2)-Cb;
+ if (max(da0,da1,da2) < -eps) return false;
+ if (min(da0,da1,da2) > +eps) return false;
+ //CSTAT(bvh_collide_prim_intersections4++);
+
+ /* project triangle B onto plane A */
+ const float db0 = dot(Na,b0)-Ca;
+ const float db1 = dot(Na,b1)-Ca;
+ const float db2 = dot(Na,b2)-Ca;
+ if (max(db0,db1,db2) < -eps) return false;
+ if (min(db0,db1,db2) > +eps) return false;
+ //CSTAT(bvh_collide_prim_intersections5++);
+
+ if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) ||
+ (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps)))
+ {
+ const size_t dz = maxDim(Na);
+ const size_t dx = (dz+1)%3;
+ const size_t dy = (dx+1)%3;
+ const Vec2f A0(a0[dx],a0[dy]);
+ const Vec2f A1(a1[dx],a1[dy]);
+ const Vec2f A2(a2[dx],a2[dy]);
+ const Vec2f B0(b0[dx],b0[dy]);
+ const Vec2f B1(b1[dx],b1[dy]);
+ const Vec2f B2(b2[dx],b2[dy]);
+ return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2);
+ }
+
+ const Vec3fa D = cross(Na,Nb);
+ const float pa0 = dot(D,a0);
+ const float pa1 = dot(D,a1);
+ const float pa2 = dot(D,a2);
+ const float pb0 = dot(D,b0);
+ const float pb1 = dot(D,b1);
+ const float pb2 = dot(D,b2);
+
+ BBox1f ba = empty;
+ if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1));
+ if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2));
+ if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0));
+
+ BBox1f bb = empty;
+ if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1));
+ if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2));
+ if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0));
+
+ return conjoint(ba,bb);
+ }
+ };
+ }
+}
+
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h
new file mode 100644
index 0000000000..4f3118cc0c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+ /* Stores M triangles from an indexed face set */
+ template <int M>
+ struct TriangleMi
+ {
+ /* Virtual interface to query information about the triangle type */
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* primitive supports multiple time segments */
+ static const bool singleTimeSegment = false;
+
+ /* Returns maximum number of stored triangles */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline TriangleMi() { }
+
+ /* Construction from vertices and IDs */
+ __forceinline TriangleMi(const vuint<M>& v0,
+ const vuint<M>& v1,
+ const vuint<M>& v2,
+ const vuint<M>& geomIDs,
+ const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+ : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+ : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+ /* Returns a mask that tells which triangles are valid */
+ __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+ /* Returns if the specified triangle is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+ /* Returns the number of stored triangles */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ __forceinline vuint<M> geomID() const { return geomIDs; }
+ __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M> primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* Calculate the bounds of the triangles */
+ __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+ {
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<M && valid(i); i++) {
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+ bounds.extend(mesh->bounds(primID(i),itime));
+ }
+ return bounds;
+ }
+
+ /* Calculate the linear bounds of the primitive */
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) {
+ return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+ allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps));
+ }
+ return allBounds;
+ }
+
+ __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+ {
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && valid(i); i++)
+ {
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i));
+ allBounds.extend(mesh->linearBounds(primID(i), time_range));
+ }
+ return allBounds;
+ }
+
+ /* Non-temporal store */
+ __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src)
+ {
+#if !defined(EMBREE_COMPACT_POLYS)
+ vuint<M>::store_nt(&dst->v0_,src.v0_);
+ vuint<M>::store_nt(&dst->v1_,src.v1_);
+ vuint<M>::store_nt(&dst->v2_,src.v2_);
+#endif
+ vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+ vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+ }
+
+ /* Fill triangle from triangle list */
+ template<typename PrimRefT>
+ __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ vuint<M> v0 = zero, v1 = zero, v2 = zero;
+ vuint<M> geomID = -1, primID = -1;
+ const PrimRefT* prim = &prims[begin];
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (begin<end) {
+ geomID[i] = prim->geomID();
+ primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(prim->geomID());
+ const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID());
+ unsigned int int_stride = mesh->vertices0.getStride()/4;
+ v0[i] = tri.v[0] * int_stride;
+ v1[i] = tri.v[1] * int_stride;
+ v2[i] = tri.v[2] * int_stride;
+#endif
+ begin++;
+ } else {
+ assert(i);
+ if (likely(i > 0)) {
+ geomID[i] = geomID[0];
+ primID[i] = -1;
+ v0[i] = v0[0];
+ v1[i] = v0[0];
+ v2[i] = v0[0];
+ }
+ }
+ if (begin<end) prim = &prims[begin];
+ }
+ new (this) TriangleMi(v0,v1,v2,geomID,primID); // FIXME: use non temporal store
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+ {
+ fill(prims, begin, end, scene);
+ return linearBounds(scene, itime);
+ }
+
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ fill(prims, begin, end, scene);
+ return linearBounds(scene, time_range);
+ }
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(TriangleMesh* mesh)
+ {
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<M; i++)
+ {
+ if (primID(i) == -1) break;
+ const unsigned int primId = primID(i);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+ const Vec3fa p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa p2 = mesh->vertex(tri.v[2]);
+ bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+ }
+ return bounds;
+ }
+
+ protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+ vuint<M> v0_; // 4 byte offset of 1st vertex
+ vuint<M> v1_; // 4 byte offset of 2nd vertex
+ vuint<M> v2_; // 4 byte offset of 3rd vertex
+#endif
+ vuint<M> geomIDs; // geometry ID of mesh
+ vuint<M> primIDs; // primitive ID of primitive inside mesh
+ };
+
+ namespace isa
+ {
+
+ template<int M>
+ struct TriangleMi : public embree::TriangleMi<M>
+ {
+#if !defined(EMBREE_COMPACT_POLYS)
+ using embree::TriangleMi<M>::v0_;
+ using embree::TriangleMi<M>::v1_;
+ using embree::TriangleMi<M>::v2_;
+#endif
+ using embree::TriangleMi<M>::geomIDs;
+ using embree::TriangleMi<M>::primIDs;
+ using embree::TriangleMi<M>::geomID;
+ using embree::TriangleMi<M>::primID;
+ using embree::TriangleMi<M>::valid;
+
+ /* loads a single vertex */
+ template<int vid>
+ __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+ {
+#if defined(EMBREE_COMPACT_POLYS)
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+ return (Vec3f) mesh->vertices[0][tri.v[vid]];
+#else
+ const vuint<M>& v = getVertexOffset<vid>();
+ const float* vertices = scene->vertices[geomID(index)];
+ return (Vec3f&) vertices[v[index]];
+#endif
+ }
+
+ template<int vid, typename T>
+ __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const
+ {
+#if defined(EMBREE_COMPACT_POLYS)
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+ const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]];
+ const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]];
+#else
+ const vuint<M>& v = getVertexOffset<vid>();
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+ const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+ const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+ const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+ const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+ const Vec3<T> p0(v0.x,v0.y,v0.z);
+ const Vec3<T> p1(v1.x,v1.y,v1.z);
+ return lerp(p0,p1,ftime);
+ }
+
+ template<int vid, int K, typename T>
+ __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const
+ {
+ Vec3<T> p0, p1;
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+
+ for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask))
+ {
+#if defined(EMBREE_COMPACT_POLYS)
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+ const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]];
+ const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]];
+#else
+ const vuint<M>& v = getVertexOffset<vid>();
+ const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+ const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+ const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+ const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+ p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z;
+ p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z;
+ }
+ return (T(one)-ftime)*p0 + ftime*p1;
+ }
+
+ struct Triangle {
+ vfloat4 v0,v1,v2;
+ };
+
+#if defined(EMBREE_COMPACT_POLYS)
+
+ __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const
+ {
+ const unsigned int geomID = geomIDs[i];
+ const unsigned int primID = primIDs[i];
+ if (unlikely(primID == -1)) return { zero, zero, zero };
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]];
+ const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]];
+ const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]];
+ return { v0, v1, v2 };
+ }
+
+ __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const
+ {
+ const unsigned int primID = primIDs[i];
+ if (unlikely(primID == -1)) return { zero, zero, zero };
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]];
+ const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]];
+ const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]];
+ return { v0, v1, v2 };
+ }
+
+#else
+
+ __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const
+ {
+ const float* vertices = scene->vertices[geomID(i)];
+ const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+ const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+ const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+ return { v0, v1, v2 };
+ }
+
+ __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const
+ {
+ const float* vertices = (const float*) mesh->vertexPtr(0,itime);
+ const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+ const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+ const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+ return { v0, v1, v2 };
+ }
+
+#endif
+
+ /* Gather the triangles */
+ __forceinline void gather(Vec3vf<M>& p0, Vec3vf<M>& p1, Vec3vf<M>& p2, const Scene* const scene) const;
+
+ template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+ __noinline
+#else
+ __forceinline
+#endif
+ void gather(const vbool<K>& valid,
+ Vec3vf<K>& p0,
+ Vec3vf<K>& p1,
+ Vec3vf<K>& p2,
+ const size_t index,
+ const Scene* const scene,
+ const vfloat<K>& time) const
+ {
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+
+ vfloat<K> ftime;
+ const vint<K> itime = mesh->timeSegment(time, ftime);
+
+ const size_t first = bsf(movemask(valid));
+ if (likely(all(valid,itime[first] == itime)))
+ {
+ p0 = getVertex<0>(index, scene, itime[first], ftime);
+ p1 = getVertex<1>(index, scene, itime[first], ftime);
+ p2 = getVertex<2>(index, scene, itime[first], ftime);
+ } else {
+ p0 = getVertex<0>(valid, index, scene, itime, ftime);
+ p1 = getVertex<1>(valid, index, scene, itime, ftime);
+ p2 = getVertex<2>(valid, index, scene, itime, ftime);
+ }
+ }
+
+ __forceinline void gather(Vec3vf<M>& p0,
+ Vec3vf<M>& p1,
+ Vec3vf<M>& p2,
+ const TriangleMesh* mesh,
+ const Scene *const scene,
+ const int itime) const;
+
+ __forceinline void gather(Vec3vf<M>& p0,
+ Vec3vf<M>& p1,
+ Vec3vf<M>& p2,
+ const Scene *const scene,
+ const float time) const;
+
+
+#if !defined(EMBREE_COMPACT_POLYS)
+ template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+ };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+ template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; }
+ template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; }
+ template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; }
+#endif
+
+ template<>
+ __forceinline void TriangleMi<4>::gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ const Scene* const scene) const
+ {
+ const Triangle tri0 = loadTriangle(0,scene);
+ const Triangle tri1 = loadTriangle(1,scene);
+ const Triangle tri2 = loadTriangle(2,scene);
+ const Triangle tri3 = loadTriangle(3,scene);
+ transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+ transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+ transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+ }
+
+ template<>
+ __forceinline void TriangleMi<4>::gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ const TriangleMesh* mesh,
+ const Scene *const scene,
+ const int itime) const
+ {
+ const Triangle tri0 = loadTriangle(0,itime,mesh);
+ const Triangle tri1 = loadTriangle(1,itime,mesh);
+ const Triangle tri2 = loadTriangle(2,itime,mesh);
+ const Triangle tri3 = loadTriangle(3,itime,mesh);
+ transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z);
+ transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z);
+ transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z);
+ }
+
+ template<>
+ __forceinline void TriangleMi<4>::gather(Vec3vf4& p0,
+ Vec3vf4& p1,
+ Vec3vf4& p2,
+ const Scene *const scene,
+ const float time) const
+ {
+ const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(0)); // in mblur mode all geometries are identical
+
+ float ftime;
+ const int itime = mesh->timeSegment(time, ftime);
+
+ Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime);
+ Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1);
+ p0 = lerp(a0,b0,vfloat4(ftime));
+ p1 = lerp(a1,b1,vfloat4(ftime));
+ p2 = lerp(a2,b2,vfloat4(ftime));
+ }
+ }
+
+ template<int M>
+ typename TriangleMi<M>::Type TriangleMi<M>::type;
+
+ typedef TriangleMi<4> Triangle4i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h
new file mode 100644
index 0000000000..e2f106a62c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h
@@ -0,0 +1,336 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "trianglei.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects M triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMiIntersector1Moeller
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ return pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M triangles with K rays */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMiIntersectorKMoeller
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ const Scene* scene = context->scene;
+ for (size_t i=0; i<Primitive::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+ const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+ const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+ const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+ pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ vbool<K> valid0 = valid_i;
+ const Scene* scene = context->scene;
+
+ for (size_t i=0; i<Primitive::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+ const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+ const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+ const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+ pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+
+ /*! Intersects M triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMiIntersector1Pluecker
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef PlueckerIntersector1<Mx> Precalculations;
+
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ return pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M triangles with K rays */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMiIntersectorKPluecker
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ const Scene* scene = context->scene;
+ for (size_t i=0; i<Primitive::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+ const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+ const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+ const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+ pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ vbool<K> valid0 = valid_i;
+ const Scene* scene = context->scene;
+
+ for (size_t i=0; i<Primitive::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+ const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+ const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+ const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+ pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+ return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+
+ /*! Intersects M motion blur triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMiMBIntersector1Moeller
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with the M triangles and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+ pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+ return pre.intersect(ray,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M motion blur triangles with K rays. */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMiMBIntersectorKMoeller
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time());
+ pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ vbool<K> valid0 = valid_i;
+ for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time());
+ pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+ pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+ return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+
+ /*! Intersects M motion blur triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMiMBIntersector1Pluecker
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef PlueckerIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with the M triangles and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+ pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time());
+ return pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M motion blur triangles with K rays. */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMiMBIntersectorKPluecker
+ {
+ typedef TriangleMi<M> Primitive;
+ typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time());
+ pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ vbool<K> valid0 = valid_i;
+ for (size_t i=0; i<TriangleMi<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ Vec3vf<K> v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time());
+ pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+ pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]);
+ return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h
new file mode 100644
index 0000000000..19af389e73
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h
@@ -0,0 +1,157 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ /* Stores the vertices of M triangles in struct of array layout */
+ template <int M>
+ struct TriangleMv
+ {
+ public:
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+ static Type type;
+
+ public:
+
+ /* Returns maximum number of stored triangles */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline TriangleMv() {}
+
+ /* Construction from vertices and IDs */
+ __forceinline TriangleMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+ : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ /* Returns a mask that tells which triangles are valid */
+ __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+ /* Returns true if the specified triangle is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+ /* Returns the number of stored triangles */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ __forceinline vuint<M>& geomID() { return geomIDs; }
+ __forceinline const vuint<M>& geomID() const { return geomIDs; }
+ __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M>& primID() { return primIDs; }
+ __forceinline const vuint<M>& primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* Calculate the bounds of the triangles */
+ __forceinline BBox3fa bounds() const
+ {
+ Vec3vf<M> lower = min(v0,v1,v2);
+ Vec3vf<M> upper = max(v0,v1,v2);
+ vbool<M> mask = valid();
+ lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+ lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+ lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+ upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+ upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+ upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+ return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+ Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+ }
+
+ /* Non temporal store */
+ __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src)
+ {
+ vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+ vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+ vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+ vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+ vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+ vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+ vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+ vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+ vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+ vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+ vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+ }
+
+ /* Fill triangle from triangle list */
+ __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+ {
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRef& prim = prims[begin];
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+ vgeomID [i] = geomID;
+ vprimID [i] = primID;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ }
+ TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID));
+ }
+
+ /* Updates the primitive */
+ __forceinline BBox3fa update(TriangleMesh* mesh)
+ {
+ BBox3fa bounds = empty;
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (primID(i) == -1) break;
+ const unsigned geomId = geomID(i);
+ const unsigned primId = primID(i);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+ const Vec3fa p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa p2 = mesh->vertex(tri.v[2]);
+ bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+ vgeomID [i] = geomId;
+ vprimID [i] = primId;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ }
+ new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID);
+ return bounds;
+ }
+
+ public:
+ Vec3vf<M> v0; // 1st vertex of the triangles
+ Vec3vf<M> v1; // 2nd vertex of the triangles
+ Vec3vf<M> v2; // 3rd vertex of the triangles
+ private:
+ vuint<M> geomIDs; // geometry ID
+ vuint<M> primIDs; // primitive ID
+ };
+
+ template<int M>
+ typename TriangleMv<M>::Type TriangleMv<M>::type;
+
+ typedef TriangleMv<4> Triangle4v;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h
new file mode 100644
index 0000000000..6af0d5a11c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h
@@ -0,0 +1,206 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_pluecker.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_woop.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects M triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMvIntersector1Moeller
+ {
+ typedef TriangleMv<M> Primitive;
+ typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+
+ template<int M, int Mx, bool filter>
+ struct TriangleMvIntersector1Woop
+ {
+ typedef TriangleMv<M> Primitive;
+ typedef WoopIntersector1<Mx> intersec;
+ typedef WoopPrecalculations1<M> Precalculations;
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+
+ /*! Intersects M triangles with K rays */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMvIntersectorKMoeller
+ {
+ typedef TriangleMv<M> Primitive;
+ typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ for (size_t i=0; i<M; i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+ const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+ const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+ pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+ const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+ const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+ pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+ }
+ };
+
+ /*! Intersects M triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMvIntersector1Pluecker
+ {
+ typedef TriangleMv<M> Primitive;
+ typedef PlueckerIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M triangles with K rays */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMvIntersectorKPluecker
+ {
+ typedef TriangleMv<M> Primitive;
+ typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ for (size_t i=0; i<M; i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+ const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+ const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+ pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<M; i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+ const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+ const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+ pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h
new file mode 100644
index 0000000000..63137aee16
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h
@@ -0,0 +1,201 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+ /* Stores the vertices of M triangles in struct of array layout */
+ template<int M>
+ struct TriangleMvMB
+ {
+ public:
+ struct Type : public PrimitiveType
+ {
+ const char* name() const;
+ size_t sizeActive(const char* This) const;
+ size_t sizeTotal(const char* This) const;
+ size_t getBytes(const char* This) const;
+ };
+
+ static Type type;
+
+ public:
+
+ /* primitive supports single time segments */
+ static const bool singleTimeSegment = true;
+
+ /* Returns maximum number of stored triangles */
+ static __forceinline size_t max_size() { return M; }
+
+ /* Returns required number of primitive blocks for N primitives */
+ static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+ public:
+
+ /* Default constructor */
+ __forceinline TriangleMvMB() {}
+
+ /* Construction from vertices and IDs */
+ __forceinline TriangleMvMB(const Vec3vf<M>& a0, const Vec3vf<M>& a1,
+ const Vec3vf<M>& b0, const Vec3vf<M>& b1,
+ const Vec3vf<M>& c0, const Vec3vf<M>& c1,
+ const vuint<M>& geomIDs, const vuint<M>& primIDs)
+ : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+ /* Returns a mask that tells which triangles are valid */
+ __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+ /* Returns if the specified triangle is valid */
+ __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+ /* Returns the number of stored triangles */
+ __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+ /* Returns the geometry IDs */
+ __forceinline vuint<M>& geomID() { return geomIDs; }
+ __forceinline const vuint<M>& geomID() const { return geomIDs; }
+ __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+ /* Returns the primitive IDs */
+ __forceinline vuint<M>& primID() { return primIDs; }
+ __forceinline const vuint<M>& primID() const { return primIDs; }
+ __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+ /* Calculate the bounds of the triangles at t0 */
+ __forceinline BBox3fa bounds0() const
+ {
+ Vec3vf<M> lower = min(v0,v1,v2);
+ Vec3vf<M> upper = max(v0,v1,v2);
+ const vbool<M> mask = valid();
+ lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+ lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+ lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+ upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+ upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+ upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+ return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+ Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+ }
+
+ /* Calculate the bounds of the triangles at t1 */
+ __forceinline BBox3fa bounds1() const
+ {
+ const Vec3vf<M> p0 = v0+dv0;
+ const Vec3vf<M> p1 = v1+dv1;
+ const Vec3vf<M> p2 = v2+dv2;
+ Vec3vf<M> lower = min(p0,p1,p2);
+ Vec3vf<M> upper = max(p0,p1,p2);
+ const vbool<M> mask = valid();
+ lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+ lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+ lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+ upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+ upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+ upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+ return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+ Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+ }
+
+ /* Calculate the linear bounds of the primitive */
+ __forceinline LBBox3fa linearBounds() const {
+ return LBBox3fa(bounds0(),bounds1());
+ }
+
+ /* Fill triangle from triangle list */
+ __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+ {
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+ Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+ BBox3fa bounds0 = empty;
+ BBox3fa bounds1 = empty;
+
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRef& prim = prims[begin];
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0);
+ const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1);
+ const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0);
+ const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1);
+ const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0);
+ const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1);
+ vgeomID [i] = geomID;
+ vprimID [i] = primID;
+ va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z;
+ va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z;
+ vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z;
+ vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z;
+ vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z;
+ vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z;
+ }
+ new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID);
+ return LBBox3fa(bounds0,bounds1);
+ }
+
+ /* Fill triangle from triangle list */
+ __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+ {
+ vuint<M> vgeomID = -1, vprimID = -1;
+ Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+ Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<M && begin<end; i++, begin++)
+ {
+ const PrimRefMB& prim = prims[begin];
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const TriangleMesh* const mesh = scene->get<TriangleMesh>(geomID);
+ const range<int> itime_range = mesh->timeSegmentRange(time_range);
+ assert(itime_range.size() == 1);
+ const int ilower = itime_range.begin();
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ allBounds.extend(mesh->linearBounds(primID, time_range));
+ const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0);
+ const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1);
+ const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0);
+ const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1);
+ const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0);
+ const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1);
+ const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1));
+ auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v);
+ auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v);
+ auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v);
+ vgeomID [i] = geomID;
+ vprimID [i] = primID;
+ va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z;
+ va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z;
+ vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z;
+ vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z;
+ vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z;
+ vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z;
+ }
+ new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID);
+ return allBounds;
+ }
+
+ public:
+ Vec3vf<M> v0; // 1st vertex of the triangles
+ Vec3vf<M> v1; // 2nd vertex of the triangles
+ Vec3vf<M> v2; // 3rd vertex of the triangles
+ Vec3vf<M> dv0; // difference vector between time steps t0 and t1 for first vertex
+ Vec3vf<M> dv1; // difference vector between time steps t0 and t1 for second vertex
+ Vec3vf<M> dv2; // difference vector between time steps t0 and t1 for third vertex
+ private:
+ vuint<M> geomIDs; // geometry ID
+ vuint<M> primIDs; // primitive ID
+ };
+
+ template<int M>
+ typename TriangleMvMB<M>::Type TriangleMvMB<M>::type;
+
+ typedef TriangleMvMB<4> Triangle4vMB;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h
new file mode 100644
index 0000000000..35a260d826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h
@@ -0,0 +1,211 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Intersects M motion blur triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMvMBIntersector1Moeller
+ {
+ typedef TriangleMvMB<M> Primitive;
+ typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with the M triangles and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time());
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time());
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M motion blur triangles with K rays. */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMvMBIntersectorKMoeller
+ {
+ typedef TriangleMvMB<M> Primitive;
+ typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> time(ray.time());
+ const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+ const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+ const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+ pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> time(ray.time());
+ const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+ const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+ const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+ pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time()[k]);
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time()[k]);
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+
+ /*! Intersects M motion blur triangles with 1 ray */
+ template<int M, int Mx, bool filter>
+ struct TriangleMvMBIntersector1Pluecker
+ {
+ typedef TriangleMvMB<M> Primitive;
+ typedef PlueckerIntersector1<Mx> Precalculations;
+
+ /*! Intersect a ray with the M triangles and updates the hit. */
+ static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time());
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of M triangles. */
+ static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time());
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ return pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+ }
+
+ static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+ {
+ return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+ }
+ };
+
+ /*! Intersects M motion blur triangles with K rays. */
+ template<int M, int Mx, int K, bool filter>
+ struct TriangleMvMBIntersectorKPluecker
+ {
+ typedef TriangleMvMB<M> Primitive;
+ typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+ /*! Intersects K rays with M triangles. */
+ static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+ const Vec3vf<K> time(ray.time());
+ const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+ const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+ const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+ pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+ }
+ }
+
+ /*! Test for K rays if they are occluded by any of the M triangles. */
+ static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ vbool<K> valid0 = valid_i;
+
+ for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+ {
+ if (!tri.valid(i)) break;
+ STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+ const Vec3vf<K> time(ray.time());
+ const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+ const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+ const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+ pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+ if (none(valid0)) break;
+ }
+ return !valid0;
+ }
+
+ /*! Intersect a ray with M triangles and updates the hit. */
+ static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(normal.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time()[k]);
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+
+ /*! Test if the ray is occluded by one of the M triangles. */
+ static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+ {
+ STAT3(shadow.trav_prims,1,1,1);
+ const Vec3vf<Mx> time(ray.time()[k]);
+ const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+ const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+ const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+ return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+ }
+ };
+ }
+}