Diffstat (limited to 'thirdparty/embree-aarch64/kernels/common')
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/accel.h                   556
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/accelinstance.h            41
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/acceln.cpp                232
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/acceln.h                   49
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/accelset.cpp               17
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/accelset.h                248
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/alloc.cpp                  82
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/alloc.h                  1006
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/buffer.h                  263
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/builder.h                  60
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/context.h                 131
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/default.h                 273
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/device.cpp                567
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/device.h                   85
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/geometry.cpp              259
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/geometry.h                582
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/hit.h                     114
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/instance_stack.h          199
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/isa.h                     271
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/motion_derivative.h       325
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/point_query.h             136
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/primref.h                 138
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/primref_mb.h              262
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/profile.h                 159
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/ray.h                    1517
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/rtcore.cpp               1799
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/rtcore.h                  142
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp        442
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene.cpp                 976
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene.h                   390
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_curves.h            341
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h         215
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_instance.h          272
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_line_segments.h     307
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_points.h            282
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h         277
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h       326
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp   243
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h     264
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h      77
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/stack_item.h              125
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/stat.cpp                  128
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/stat.h                    116
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/state.cpp                 543
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/state.h                   197
-rw-r--r--  thirdparty/embree-aarch64/kernels/common/vector.h                   76
46 files changed, 15110 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h
new file mode 100644
index 0000000000..c038d3cf21
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accel.h
@@ -0,0 +1,556 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "point_query.h"
+#include "context.h"
+
+namespace embree
+{
+ class Scene;
+
+ /*! Base class for the acceleration structure data. */
+ class AccelData : public RefCount
+ {
+ ALIGNED_CLASS_(16);
+ public:
+ enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 };
+
+ public:
+ AccelData (const Type type)
+ : bounds(empty), type(type) {}
+
+ /*! notifies the acceleration structure about the deletion of some geometry */
+ virtual void deleteGeometry(size_t geomID) {};
+
+ /*! clears the acceleration structure data */
+ virtual void clear() = 0;
+
+ /*! returns normal bounds */
+ __forceinline BBox3fa getBounds() const {
+ return bounds.bounds();
+ }
+
+ /*! returns interpolated bounds at time t */
+ __forceinline BBox3fa getBounds(float t) const {
+ return bounds.interpolate(t);
+ }
+
+ /*! returns linear bounds */
+ __forceinline LBBox3fa getLinearBounds() const {
+ return bounds;
+ }
+
+ /*! checks if acceleration structure is empty */
+ __forceinline bool isEmpty() const {
+ return bounds.bounds0.lower.x == float(pos_inf);
+ }
+
+ public:
+ LBBox3fa bounds; // linear bounds
+ Type type;
+ };
+
+ /*! Base class for all intersectable and buildable acceleration structures. */
+ class Accel : public AccelData
+ {
+ ALIGNED_CLASS_(16);
+ public:
+
+ struct Intersectors;
+
+ /*! Type of collide function */
+ typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr);
+
+ /*! Type of point query function */
+ typedef bool(*PointQueryFunc)(Intersectors* This, /*!< this pointer to accel */
+ PointQuery* query, /*!< point query for lookup */
+ PointQueryContext* context); /*!< point query context */
+
+ /*! Type of intersect function pointer for single rays. */
+ typedef void (*IntersectFunc)(Intersectors* This, /*!< this pointer to accel */
+ RTCRayHit& ray, /*!< ray to intersect */
+ IntersectContext* context);
+
+ /*! Type of intersect function pointer for ray packets of size 4. */
+ typedef void (*IntersectFunc4)(const void* valid, /*!< pointer to valid mask */
+ Intersectors* This, /*!< this pointer to accel */
+ RTCRayHit4& ray, /*!< ray packet to intersect */
+ IntersectContext* context);
+
+ /*! Type of intersect function pointer for ray packets of size 8. */
+ typedef void (*IntersectFunc8)(const void* valid, /*!< pointer to valid mask */
+ Intersectors* This, /*!< this pointer to accel */
+ RTCRayHit8& ray, /*!< ray packet to intersect */
+ IntersectContext* context);
+
+ /*! Type of intersect function pointer for ray packets of size 16. */
+ typedef void (*IntersectFunc16)(const void* valid, /*!< pointer to valid mask */
+ Intersectors* This, /*!< this pointer to accel */
+ RTCRayHit16& ray, /*!< ray packet to intersect */
+ IntersectContext* context);
+
+ /*! Type of intersect function pointer for ray streams of size N. */
+ typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */
+ RTCRayHitN** ray, /*!< ray stream to intersect */
+ const size_t N, /*!< number of rays in stream */
+ IntersectContext* context /*!< intersection context */);
+
+
+ /*! Type of occlusion function pointer for single rays. */
+ typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */
+ RTCRay& ray, /*!< ray to test occlusion */
+ IntersectContext* context);
+
+ /*! Type of occlusion function pointer for ray packets of size 4. */
+ typedef void (*OccludedFunc4) (const void* valid, /*!< pointer to valid mask */
+ Intersectors* This, /*!< this pointer to accel */
+ RTCRay4& ray, /*!< ray packet to test occlusion. */
+ IntersectContext* context);
+
+ /*! Type of occlusion function pointer for ray packets of size 8. */
+ typedef void (*OccludedFunc8) (const void* valid, /*!< pointer to valid mask */
+ Intersectors* This, /*!< this pointer to accel */
+ RTCRay8& ray, /*!< ray packet to test occlusion. */
+ IntersectContext* context);
+
+ /*! Type of occlusion function pointer for ray packets of size 16. */
+ typedef void (*OccludedFunc16) (const void* valid, /*!< pointer to valid mask */
+ Intersectors* This, /*!< this pointer to accel */
+ RTCRay16& ray, /*!< ray packet to test occlusion. */
+ IntersectContext* context);
+
+ /*! Type of occlusion function pointer for ray streams of size N. */
+ typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */
+ RTCRayN** ray, /*!< ray stream to test occlusion */
+ const size_t N, /*!< number of rays in stream */
+ IntersectContext* context /*!< intersection context */);
+ typedef void (*ErrorFunc) ();
+
+ struct Collider
+ {
+ Collider (ErrorFunc error = nullptr)
+ : collide((CollideFunc)error), name(nullptr) {}
+
+ Collider (CollideFunc collide, const char* name)
+ : collide(collide), name(name) {}
+
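+ /* a table entry counts as present iff it carries a name; AccelN::accels_build
+ relies on these bool conversions to decide which packet widths all child
+ accels support */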
+ operator bool() const { return name; }
+
+ public:
+ CollideFunc collide;
+ const char* name;
+ };
+
+ struct Intersector1
+ {
+ Intersector1 (ErrorFunc error = nullptr)
+ : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {}
+
+ Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name)
+ : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {}
+
+ Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name)
+ : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {}
+
+ operator bool() const { return name; }
+
+ public:
+ static const char* type;
+ IntersectFunc intersect;
+ OccludedFunc occluded;
+ PointQueryFunc pointQuery;
+ const char* name;
+ };
+
+ struct Intersector4
+ {
+ Intersector4 (ErrorFunc error = nullptr)
+ : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {}
+
+ Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name)
+ : intersect(intersect), occluded(occluded), name(name) {}
+
+ operator bool() const { return name; }
+
+ public:
+ static const char* type;
+ IntersectFunc4 intersect;
+ OccludedFunc4 occluded;
+ const char* name;
+ };
+
+ struct Intersector8
+ {
+ Intersector8 (ErrorFunc error = nullptr)
+ : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {}
+
+ Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name)
+ : intersect(intersect), occluded(occluded), name(name) {}
+
+ operator bool() const { return name; }
+
+ public:
+ static const char* type;
+ IntersectFunc8 intersect;
+ OccludedFunc8 occluded;
+ const char* name;
+ };
+
+ struct Intersector16
+ {
+ Intersector16 (ErrorFunc error = nullptr)
+ : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {}
+
+ Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name)
+ : intersect(intersect), occluded(occluded), name(name) {}
+
+ operator bool() const { return name; }
+
+ public:
+ static const char* type;
+ IntersectFunc16 intersect;
+ OccludedFunc16 occluded;
+ const char* name;
+ };
+
+ struct IntersectorN
+ {
+ IntersectorN (ErrorFunc error = nullptr)
+ : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+
+ IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+ : intersect(intersect), occluded(occluded), name(name) {}
+
+ operator bool() const { return name; }
+
+ public:
+ static const char* type;
+ IntersectFuncN intersect;
+ OccludedFuncN occluded;
+ const char* name;
+ };
+
+ struct Intersectors
+ {
+ Intersectors()
+ : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {}
+
+ Intersectors (ErrorFunc error)
+ : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {}
+
+ void print(size_t ident)
+ {
+ if (collider.name) {
+ for (size_t i=0; i<ident; i++) std::cout << " ";
+ std::cout << "collider = " << collider.name << std::endl;
+ }
+ if (intersector1.name) {
+ for (size_t i=0; i<ident; i++) std::cout << " ";
+ std::cout << "intersector1 = " << intersector1.name << std::endl;
+ }
+ if (intersector4.name) {
+ for (size_t i=0; i<ident; i++) std::cout << " ";
+ std::cout << "intersector4 = " << intersector4.name << std::endl;
+ }
+ if (intersector8.name) {
+ for (size_t i=0; i<ident; i++) std::cout << " ";
+ std::cout << "intersector8 = " << intersector8.name << std::endl;
+ }
+ if (intersector16.name) {
+ for (size_t i=0; i<ident; i++) std::cout << " ";
+ std::cout << "intersector16 = " << intersector16.name << std::endl;
+ }
+ if (intersectorN.name) {
+ for (size_t i=0; i<ident; i++) std::cout << " ";
+ std::cout << "intersectorN = " << intersectorN.name << std::endl;
+ }
+ }
+
+ void select(bool filter)
+ {
+ if (intersector4_filter) {
+ if (filter) intersector4 = intersector4_filter;
+ else intersector4 = intersector4_nofilter;
+ }
+ if (intersector8_filter) {
+ if (filter) intersector8 = intersector8_filter;
+ else intersector8 = intersector8_nofilter;
+ }
+ if (intersector16_filter) {
+ if (filter) intersector16 = intersector16_filter;
+ else intersector16 = intersector16_nofilter;
+ }
+ if (intersectorN_filter) {
+ if (filter) intersectorN = intersectorN_filter;
+ else intersectorN = intersectorN_nofilter;
+ }
+ }
+
+ __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) {
+ assert(intersector1.pointQuery);
+ return intersector1.pointQuery(this,query,context);
+ }
+
+ /*! collides two scenes */
+ __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) {
+ assert(collider.collide);
+ collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr);
+ }
+
+ /*! Intersects a single ray with the scene. */
+ __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) {
+ assert(intersector1.intersect);
+ intersector1.intersect(this,ray,context);
+ }
+
+ /*! Intersects a packet of 4 rays with the scene. */
+ __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) {
+ assert(intersector4.intersect);
+ intersector4.intersect(valid,this,ray,context);
+ }
+
+ /*! Intersects a packet of 8 rays with the scene. */
+ __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) {
+ assert(intersector8.intersect);
+ intersector8.intersect(valid,this,ray,context);
+ }
+
+ /*! Intersects a packet of 16 rays with the scene. */
+ __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) {
+ assert(intersector16.intersect);
+ intersector16.intersect(valid,this,ray,context);
+ }
+
+ /*! Intersects a stream of N rays in SOA layout with the scene. */
+ __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context)
+ {
+ assert(intersectorN.intersect);
+ intersectorN.intersect(this,rayN,N,context);
+ }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+ __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
+ const vint<4> mask = valid.mask32();
+ intersect4(&mask,(RTCRayHit4&)ray,context);
+ }
+#endif
+#if defined(__AVX__)
+ __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) {
+ const vint<8> mask = valid.mask32();
+ intersect8(&mask,(RTCRayHit8&)ray,context);
+ }
+#endif
+#if defined(__AVX512F__)
+ __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) {
+ const vint<16> mask = valid.mask32();
+ intersect16(&mask,(RTCRayHit16&)ray,context);
+ }
+#endif
+
+ template<int K>
+ __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context)
+ {
+ intersectN((RTCRayHitN**)rayN,N,context);
+ }
+
+ /*! Tests if a single ray is occluded by the scene. */
+ __forceinline void occluded (RTCRay& ray, IntersectContext* context) {
+ assert(intersector1.occluded);
+ intersector1.occluded(this,ray,context);
+ }
+
+ /*! Tests if a packet of 4 rays is occluded by the scene. */
+ __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) {
+ assert(intersector4.occluded);
+ intersector4.occluded(valid,this,ray,context);
+ }
+
+ /*! Tests if a packet of 8 rays is occluded by the scene. */
+ __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) {
+ assert(intersector8.occluded);
+ intersector8.occluded(valid,this,ray,context);
+ }
+
+ /*! Tests if a packet of 16 rays is occluded by the scene. */
+ __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) {
+ assert(intersector16.occluded);
+ intersector16.occluded(valid,this,ray,context);
+ }
+
+ /*! Tests if a stream of N rays in SOA layout is occluded by the scene. */
+ __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context)
+ {
+ assert(intersectorN.occluded);
+ intersectorN.occluded(this,rayN,N,context);
+ }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+ __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
+ const vint<4> mask = valid.mask32();
+ occluded4(&mask,(RTCRay4&)ray,context);
+ }
+#endif
+#if defined(__AVX__)
+ __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) {
+ const vint<8> mask = valid.mask32();
+ occluded8(&mask,(RTCRay8&)ray,context);
+ }
+#endif
+#if defined(__AVX512F__)
+ __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) {
+ const vint<16> mask = valid.mask32();
+ occluded16(&mask,(RTCRay16&)ray,context);
+ }
+#endif
+
+ template<int K>
+ __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context)
+ {
+ occludedN((RTCRayN**)rayN,N,context);
+ }
+
+ /*! Tests if a single ray is occluded by the scene. */
+ __forceinline void intersect(RTCRay& ray, IntersectContext* context) {
+ occluded(ray, context);
+ }
+
+ /*! Tests if a packet of K rays is occluded by the scene. */
+ template<int K>
+ __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) {
+ occluded(valid, ray, context);
+ }
+
+ /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */
+ template<int K>
+ __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) {
+ occludedN(rayN, N, context);
+ }
+
+ public:
+ AccelData* ptr;
+ void* leafIntersector;
+ Collider collider;
+ Intersector1 intersector1;
+ Intersector4 intersector4;
+ Intersector4 intersector4_filter;
+ Intersector4 intersector4_nofilter;
+ Intersector8 intersector8;
+ Intersector8 intersector8_filter;
+ Intersector8 intersector8_nofilter;
+ Intersector16 intersector16;
+ Intersector16 intersector16_filter;
+ Intersector16 intersector16_nofilter;
+ IntersectorN intersectorN;
+ IntersectorN intersectorN_filter;
+ IntersectorN intersectorN_nofilter;
+ };
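+
+ /* Usage sketch (illustration only, not part of this header): the rtcIntersect1
+ entry point boils down to code of roughly this shape,
+
+ IntersectContext context(scene,user_context);
+ scene->intersectors.intersect(*rayhit,&context);
+
+ i.e. dispatch goes through the per-ISA function pointer table selected at
+ build time rather than through C++ virtual calls. */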
+
+ public:
+
+ /*! Construction */
+ Accel (const AccelData::Type type)
+ : AccelData(type) {}
+
+ /*! Construction */
+ Accel (const AccelData::Type type, const Intersectors& intersectors)
+ : AccelData(type), intersectors(intersectors) {}
+
+ /*! Virtual destructor */
+ virtual ~Accel() {}
+
+ /*! makes the acceleration structure immutable */
+ virtual void immutable () {}
+
+ /*! build acceleration structure */
+ virtual void build () = 0;
+
+ public:
+ Intersectors intersectors;
+ };
+
+#define DEFINE_COLLIDER(symbol,collider) \
+ Accel::Collider symbol() { \
+ return Accel::Collider((Accel::CollideFunc)collider::collide, \
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
+
+#define DEFINE_INTERSECTOR1(symbol,intersector) \
+ Accel::Intersector1 symbol() { \
+ return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \
+ (Accel::OccludedFunc )intersector::occluded, \
+ (Accel::PointQueryFunc)intersector::pointQuery,\
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
+
+#define DEFINE_INTERSECTOR4(symbol,intersector) \
+ Accel::Intersector4 symbol() { \
+ return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \
+ (Accel::OccludedFunc4)intersector::occluded, \
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
+
+#define DEFINE_INTERSECTOR8(symbol,intersector) \
+ Accel::Intersector8 symbol() { \
+ return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \
+ (Accel::OccludedFunc8)intersector::occluded, \
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
+
+#define DEFINE_INTERSECTOR16(symbol,intersector) \
+ Accel::Intersector16 symbol() { \
+ return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \
+ (Accel::OccludedFunc16)intersector::occluded, \
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
+
+#define DEFINE_INTERSECTORN(symbol,intersector) \
+ Accel::IntersectorN symbol() { \
+ return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \
+ (Accel::OccludedFuncN)intersector::occluded, \
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
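+
+ /* Illustration (assumed usage, not part of this header): an ISA-specific
+ translation unit that defines `isa` and a hypothetical intersector class
+ MyIntersector1 can write
+
+ DEFINE_INTERSECTOR1(MySymbol,MyIntersector1)
+
+ to obtain a factory Accel::Intersector1 MySymbol() that binds
+ MyIntersector1::intersect/occluded/pointQuery and names the table entry
+ "<isa>::MySymbol" for the statistics printing above. */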
+
+ /* ray stream filter interface */
+ typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit* _rayN, const size_t N, const size_t stride, IntersectContext* context);
+ typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context);
+ typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+ typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context);
+
+ typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay* _rayN, const size_t N, const size_t stride, IntersectContext* context);
+ typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context);
+ typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+ typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context);
+
+ struct RayStreamFilterFuncs
+ {
+ RayStreamFilterFuncs()
+ : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr),
+ occludedAOS(nullptr), occludedAOP(nullptr), occludedSOA(nullptr), occludedSOP(nullptr) {}
+
+ RayStreamFilterFuncs(void (*ptr) ())
+ : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr),
+ occludedAOS((occludedStreamAOS_func) ptr), occludedAOP((occludedStreamAOP_func) ptr), occludedSOA((occludedStreamSOA_func) ptr), occludedSOP((occludedStreamSOP_func) ptr) {}
+
+ RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP,
+ occludedStreamAOS_func occludedAOS, occludedStreamAOP_func occludedAOP, occludedStreamSOA_func occludedSOA, occludedStreamSOP_func occludedSOP)
+ : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP),
+ occludedAOS(occludedAOS), occludedAOP(occludedAOP), occludedSOA(occludedSOA), occludedSOP(occludedSOP) {}
+
+ public:
+ intersectStreamAOS_func intersectAOS;
+ intersectStreamAOP_func intersectAOP;
+ intersectStreamSOA_func intersectSOA;
+ intersectStreamSOP_func intersectSOP;
+
+ occludedStreamAOS_func occludedAOS;
+ occludedStreamAOP_func occludedAOP;
+ occludedStreamSOA_func occludedSOA;
+ occludedStreamSOP_func occludedSOP;
+ };
+
+ typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)();
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelinstance.h b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
new file mode 100644
index 0000000000..d74b96df3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+#include "builder.h"
+
+namespace embree
+{
+ class AccelInstance : public Accel
+ {
+ public:
+ AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors)
+ : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {}
+
+ void immutable () {
+ builder.reset(nullptr);
+ }
+
+ public:
+ void build () {
+ if (builder) builder->build();
+ bounds = accel->bounds;
+ }
+
+ void deleteGeometry(size_t geomID) {
+ if (accel ) accel->deleteGeometry(geomID);
+ if (builder) builder->deleteGeometry(geomID);
+ }
+
+ void clear() {
+ if (accel) accel->clear();
+ if (builder) builder->clear();
+ }
+
+ private:
+ std::unique_ptr<AccelData> accel;
+ std::unique_ptr<Builder> builder;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
new file mode 100644
index 0000000000..aadb4a64ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
@@ -0,0 +1,232 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "acceln.h"
+#include "ray.h"
+#include "../../include/embree3/rtcore_ray.h"
+#include "../../common/algorithms/parallel_for.h"
+
+namespace embree
+{
+ AccelN::AccelN()
+ : Accel(AccelData::TY_ACCELN), accels() {}
+
+ AccelN::~AccelN()
+ {
+ for (size_t i=0; i<accels.size(); i++)
+ delete accels[i];
+ }
+
+ void AccelN::accels_add(Accel* accel)
+ {
+ assert(accel);
+ accels.push_back(accel);
+ }
+
+ void AccelN::accels_init()
+ {
+ for (size_t i=0; i<accels.size(); i++)
+ delete accels[i];
+
+ accels.clear();
+ }
+
+ bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context)
+ {
+ bool changed = false;
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ changed |= This->accels[i]->intersectors.pointQuery(query,context);
+ return changed;
+ }
+
+ void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ This->accels[i]->intersectors.intersect(ray,context);
+ }
+
+ void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ This->accels[i]->intersectors.intersect4(valid,ray,context);
+ }
+
+ void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ This->accels[i]->intersectors.intersect8(valid,ray,context);
+ }
+
+ void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ This->accels[i]->intersectors.intersect16(valid,ray,context);
+ }
+
+ void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ This->accels[i]->intersectors.intersectN(ray,N,context);
+ }
+
+ void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++) {
+ if (This->accels[i]->isEmpty()) continue;
+ This->accels[i]->intersectors.occluded(ray,context);
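+ // rtcOccluded marks a hit by setting tfar to -inf, so a negative tfar means
+ // this ray is already occluded and the remaining accels can be skipped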
+ if (ray.tfar < 0.0f) break;
+ }
+ }
+
+ void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++) {
+ if (This->accels[i]->isEmpty()) continue;
+ This->accels[i]->intersectors.occluded4(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON)
+ vbool4 valid0 = asBool(((vint4*)valid)[0]);
+ vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+ if (unlikely(none(valid0 & hit0))) break;
+#endif
+ }
+ }
+
+ void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++) {
+ if (This->accels[i]->isEmpty()) continue;
+ This->accels[i]->intersectors.occluded8(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+ vbool4 valid0 = asBool(((vint4*)valid)[0]);
+ vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+ vbool4 valid1 = asBool(((vint4*)valid)[1]);
+ vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+ if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break;
+#endif
+ }
+ }
+
+ void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ for (size_t i=0; i<This->accels.size(); i++) {
+ if (This->accels[i]->isEmpty()) continue;
+ This->accels[i]->intersectors.occluded16(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+ vbool4 valid0 = asBool(((vint4*)valid)[0]);
+ vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+ vbool4 valid1 = asBool(((vint4*)valid)[1]);
+ vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+ vbool4 valid2 = asBool(((vint4*)valid)[2]);
+ vbool4 hit2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero);
+ vbool4 valid3 = asBool(((vint4*)valid)[3]);
+ vbool4 hit3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero);
+ if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break;
+#endif
+ }
+ }
+
+ void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context)
+ {
+ AccelN* This = (AccelN*)This_in->ptr;
+ size_t M = N;
+ for (size_t i=0; i<This->accels.size(); i++)
+ if (!This->accels[i]->isEmpty())
+ This->accels[i]->intersectors.occludedN(ray,M,context);
+ }
+
+ void AccelN::accels_print(size_t ident)
+ {
+ for (size_t i=0; i<accels.size(); i++)
+ {
+ for (size_t j=0; j<ident; j++) std::cout << " ";
+ std::cout << "accels[" << i << "]" << std::endl;
+ accels[i]->intersectors.print(ident+2);
+ }
+ }
+
+ void AccelN::accels_immutable()
+ {
+ for (size_t i=0; i<accels.size(); i++)
+ accels[i]->immutable();
+ }
+
+ void AccelN::accels_build ()
+ {
+ /* reduce memory consumption */
+ accels.shrink_to_fit();
+
+ /* build all acceleration structures in parallel */
+ parallel_for (accels.size(), [&] (size_t i) {
+ accels[i]->build();
+ });
+
+ /* create list of non-empty acceleration structures */
+ bool valid1 = true;
+ bool valid4 = true;
+ bool valid8 = true;
+ bool valid16 = true;
+ for (size_t i=0; i<accels.size(); i++) {
+ valid1 &= (bool) accels[i]->intersectors.intersector1;
+ valid4 &= (bool) accels[i]->intersectors.intersector4;
+ valid8 &= (bool) accels[i]->intersectors.intersector8;
+ valid16 &= (bool) accels[i]->intersectors.intersector16;
+ }
+
+ if (accels.size() == 1) {
+ type = accels[0]->type; // FIXME: should just assign entire Accel
+ bounds = accels[0]->bounds;
+ intersectors = accels[0]->intersectors;
+ }
+ else
+ {
+ type = AccelData::TY_ACCELN;
+ intersectors.ptr = this;
+ intersectors.intersector1 = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr);
+ intersectors.intersector4 = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr);
+ intersectors.intersector8 = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr);
+ intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr);
+ intersectors.intersectorN = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN");
+
+ /*! calculate bounds */
+ bounds = empty;
+ for (size_t i=0; i<accels.size(); i++)
+ bounds.extend(accels[i]->bounds);
+ }
+ }
+
+ void AccelN::accels_select(bool filter)
+ {
+ for (size_t i=0; i<accels.size(); i++)
+ accels[i]->intersectors.select(filter);
+ }
+
+ void AccelN::accels_deleteGeometry(size_t geomID)
+ {
+ for (size_t i=0; i<accels.size(); i++)
+ accels[i]->deleteGeometry(geomID);
+ }
+
+ void AccelN::accels_clear()
+ {
+ for (size_t i=0; i<accels.size(); i++) {
+ accels[i]->clear();
+ }
+ }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h
new file mode 100644
index 0000000000..2edd98f647
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+
+namespace embree
+{
+ /*! merges N acceleration structures together by processing them in order */
+ class AccelN : public Accel
+ {
+ public:
+ AccelN ();
+ ~AccelN();
+
+ public:
+ void accels_add(Accel* accel);
+ void accels_init();
+
+ public:
+ static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+
+ public:
+ static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context);
+ static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context);
+ static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context);
+ static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context);
+ static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context);
+
+ public:
+ static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context);
+ static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context);
+ static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context);
+ static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context);
+ static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context);
+
+ public:
+ void accels_print(size_t ident);
+ void accels_immutable();
+ void accels_build ();
+ void accels_select(bool filter);
+ void accels_deleteGeometry(size_t geomID);
+ void accels_clear ();
+
+ public:
+ std::vector<Accel*> accels;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
new file mode 100644
index 0000000000..79be1c4301
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "accelset.h"
+#include "scene.h"
+
+namespace embree
+{
+ AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps)
+ : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {}
+
+ AccelSet::IntersectorN::IntersectorN (ErrorFunc error)
+ : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+
+ AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+ : intersect(intersect), occluded(occluded), name(name) {}
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h
new file mode 100644
index 0000000000..3774b2accb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.h
@@ -0,0 +1,248 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "builder.h"
+#include "geometry.h"
+#include "ray.h"
+#include "hit.h"
+
+namespace embree
+{
+ struct IntersectFunctionNArguments;
+ struct OccludedFunctionNArguments;
+
+ typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+ typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+
+ struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
+ {
+ IntersectContext* internal_context;
+ Geometry* geometry;
+ ReportIntersectionFunc report;
+ };
+
+ struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
+ {
+ IntersectContext* internal_context;
+ Geometry* geometry;
+ ReportOcclusionFunc report;
+ };
+
+ /*! Base class for a set of acceleration structures. */
+ class AccelSet : public Geometry
+ {
+ public:
+ typedef RTCIntersectFunctionN IntersectFuncN;
+ typedef RTCOccludedFunctionN OccludedFuncN;
+ typedef void (*ErrorFunc) ();
+
+ struct IntersectorN
+ {
+ IntersectorN (ErrorFunc error = nullptr);
+ IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name);
+
+ operator bool() const { return name; }
+
+ public:
+ static const char* type;
+ IntersectFuncN intersect;
+ OccludedFuncN occluded;
+ const char* name;
+ };
+
+ public:
+
+ /*! construction */
+ AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps);
+
+ /*! makes the acceleration structure immutable */
+ virtual void immutable () {}
+
+ /*! build accel */
+ virtual void build () = 0;
+
+ /*! check if the i'th primitive is valid within the specified time range */
+ __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+ {
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+ if (!isvalid_non_empty(bounds(i,itime))) return false;
+
+ return true;
+ }
+
+ /*! Calculates the bounds of an item */
+ __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const
+ {
+ BBox3fa box;
+ assert(i < size());
+ RTCBoundsFunctionArguments args;
+ args.geometryUserPtr = userPtr;
+ args.primID = (unsigned int)i;
+ args.timeStep = (unsigned int)itime;
+ args.bounds_o = (RTCBounds*)&box;
+ boundsFunc(&args);
+ return box;
+ }
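+
+ /* Sketch of a matching user callback (hypothetical example, illustration only):
+
+ void mySphereBoundsFunc (const struct RTCBoundsFunctionArguments* args) {
+ const MySphere* prims = (const MySphere*) args->geometryUserPtr;
+ const MySphere& s = prims[args->primID]; // args->timeStep selects the motion key
+ args->bounds_o->lower_x = s.x-s.r; args->bounds_o->upper_x = s.x+s.r;
+ args->bounds_o->lower_y = s.y-s.r; args->bounds_o->upper_y = s.y+s.r;
+ args->bounds_o->lower_z = s.z-s.r; args->bounds_o->upper_z = s.z+s.r;
+ }
+
+ registered through rtcSetGeometryBoundsFunction, which is what presumably
+ fills boundsFunc below. */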
+
+ /*! calculates the linear bounds of the i'th item at the itime'th time segment */
+ __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const
+ {
+ BBox3fa box[2];
+ assert(i < size());
+ RTCBoundsFunctionArguments args;
+ args.geometryUserPtr = userPtr;
+ args.primID = (unsigned int)i;
+ args.timeStep = (unsigned int)(itime+0);
+ args.bounds_o = (RTCBounds*)&box[0];
+ boundsFunc(&args);
+ args.timeStep = (unsigned int)(itime+1);
+ args.bounds_o = (RTCBounds*)&box[1];
+ boundsFunc(&args);
+ return LBBox3fa(box[0],box[1]);
+ }
+
+ /*! calculates the build bounds of the i'th item, if it's valid */
+ __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+ {
+ const BBox3fa b = bounds(i);
+ if (bbox) *bbox = b;
+ return isvalid_non_empty(b);
+ }
+
+ /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */
+ __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+ {
+ const LBBox3fa bounds = linearBounds(i,itime);
+ bbox = bounds.bounds0; // use bounding box of first timestep to build BVH
+ return isvalid_non_empty(bounds);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const {
+ if (!valid(i, timeSegmentRange(time_range))) return false;
+ bbox = linearBounds(i, time_range);
+ return true;
+ }
+
+ /* gets version info of topology */
+ unsigned int getTopologyVersion() const {
+ return numPrimitives;
+ }
+
+ /* returns true if topology changed */
+ bool topologyChanged(unsigned int otherVersion) const {
+ return numPrimitives != otherVersion;
+ }
+
+ public:
+
+ /*! Intersects a single ray with the scene. */
+ __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report)
+ {
+ assert(primID < size());
+ assert(intersectorN.intersect);
+
+ int mask = -1;
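+ // the user geometry callback ABI passes an integer validity mask
+ // (-1 = lane active, 0 = lane inactive); a single ray is handed over
+ // as a packet with N == 1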
+ IntersectFunctionNArguments args;
+ args.valid = &mask;
+ args.geometryUserPtr = userPtr;
+ args.context = context->user;
+ args.rayhit = (RTCRayHitN*)&ray;
+ args.N = 1;
+ args.geomID = geomID;
+ args.primID = primID;
+ args.internal_context = context;
+ args.geometry = this;
+ args.report = report;
+
+ intersectorN.intersect(&args);
+ }
+
+ /*! Tests if single ray is occluded by the scene. */
+ __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+ {
+ assert(primID < size());
+ assert(intersectorN.occluded);
+
+ int mask = -1;
+ OccludedFunctionNArguments args;
+ args.valid = &mask;
+ args.geometryUserPtr = userPtr;
+ args.context = context->user;
+ args.ray = (RTCRayN*)&ray;
+ args.N = 1;
+ args.geomID = geomID;
+ args.primID = primID;
+ args.internal_context = context;
+ args.geometry = this;
+ args.report = report;
+
+ intersectorN.occluded(&args);
+ }
+
+ /*! Intersects a packet of K rays with the scene. */
+ template<int K>
+ __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report)
+ {
+ assert(primID < size());
+ assert(intersectorN.intersect);
+
+ vint<K> mask = valid.mask32();
+ IntersectFunctionNArguments args;
+ args.valid = (int*)&mask;
+ args.geometryUserPtr = userPtr;
+ args.context = context->user;
+ args.rayhit = (RTCRayHitN*)&ray;
+ args.N = K;
+ args.geomID = geomID;
+ args.primID = primID;
+ args.internal_context = context;
+ args.geometry = this;
+ args.report = report;
+
+ intersectorN.intersect(&args);
+ }
+
+ /*! Tests if a packet of K rays is occluded by the scene. */
+ template<int K>
+ __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+ {
+ assert(primID < size());
+ assert(intersectorN.occluded);
+
+ vint<K> mask = valid.mask32();
+ OccludedFunctionNArguments args;
+ args.valid = (int*)&mask;
+ args.geometryUserPtr = userPtr;
+ args.context = context->user;
+ args.ray = (RTCRayN*)&ray;
+ args.N = K;
+ args.geomID = geomID;
+ args.primID = primID;
+ args.internal_context = context;
+ args.geometry = this;
+ args.report = report;
+
+ intersectorN.occluded(&args);
+ }
+
+ public:
+ RTCBoundsFunction boundsFunc;
+ IntersectorN intersectorN;
+ };
+
+#define DEFINE_SET_INTERSECTORN(symbol,intersector) \
+ AccelSet::IntersectorN symbol() { \
+ return AccelSet::IntersectorN(intersector::intersect, \
+ intersector::occluded, \
+ TOSTRING(isa) "::" TOSTRING(symbol)); \
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
new file mode 100644
index 0000000000..6fa406f03a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
@@ -0,0 +1,82 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "../../common/sys/thread.h"
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include "../../common/sys/barrier.h"
+#endif
+
+namespace embree
+{
+ __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr;
+ SpinLock FastAllocator::s_thread_local_allocators_lock;
+ std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators;
+
+ struct fast_allocator_regression_test : public RegressionTest
+ {
+ BarrierSys barrier;
+ std::atomic<size_t> numFailed;
+ std::unique_ptr<FastAllocator> alloc;
+
+ fast_allocator_regression_test()
+ : RegressionTest("fast_allocator_regression_test"), numFailed(0)
+ {
+ registerRegressionTest(this);
+ }
+
+ static void thread_alloc(fast_allocator_regression_test* This)
+ {
+ FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator();
+
+ size_t* ptrs[1000];
+ for (size_t j=0; j<1000; j++)
+ {
+ This->barrier.wait();
+ for (size_t i=0; i<1000; i++) {
+ ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32));
+ *ptrs[i] = size_t(threadalloc.talloc0) + i;
+ }
+ for (size_t i=0; i<1000; i++) {
+ if (*ptrs[i] != size_t(threadalloc.talloc0) + i)
+ This->numFailed++;
+ }
+ This->barrier.wait();
+ }
+ }
+
+ bool run ()
+ {
+ alloc = make_unique(new FastAllocator(nullptr,false));
+ numFailed.store(0);
+
+ size_t numThreads = getNumberOfLogicalThreads();
+ barrier.init(numThreads+1);
+
+ /* create threads */
+ std::vector<thread_t> threads;
+ for (size_t i=0; i<numThreads; i++)
+ threads.push_back(createThread((thread_func)thread_alloc,this));
+
+ /* run test */
+ for (size_t i=0; i<1000; i++)
+ {
+ alloc->reset();
+ barrier.wait();
+ barrier.wait();
+ }
+
+ /* destroy threads */
+ for (size_t i=0; i<numThreads; i++)
+ join(threads[i]);
+
+ alloc = nullptr;
+
+ return numFailed == 0;
+ }
+ };
+
+ fast_allocator_regression_test fast_allocator_regression;
+}
+
+
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.h b/thirdparty/embree-aarch64/kernels/common/alloc.h
new file mode 100644
index 0000000000..488fa707ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.h
@@ -0,0 +1,1006 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "primref.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+namespace embree
+{
+ class FastAllocator
+ {
+ /*! maximum supported alignment */
+ static const size_t maxAlignment = 64;
+
+ /*! maximum allocation size */
+
+ /* default settings */
+ //static const size_t defaultBlockSize = 4096;
+#define maxAllocationSize size_t(2*1024*1024-maxAlignment)
+
+ static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;
+
+ public:
+
+ struct ThreadLocal2;
+ enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
+
+ /*! Per thread structure holding the current memory block. */
+ struct __aligned(64) ThreadLocal
+ {
+ ALIGNED_CLASS_(64);
+ public:
+
+ /*! Constructor for usage with ThreadLocalData */
+ __forceinline ThreadLocal (ThreadLocal2* parent)
+ : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+
+ /*! initialize allocator */
+ void init(FastAllocator* alloc)
+ {
+ ptr = nullptr;
+ cur = end = 0;
+ bytesUsed = 0;
+ bytesWasted = 0;
+ allocBlockSize = 0;
+ if (alloc) allocBlockSize = alloc->defaultBlockSize;
+ }
+
+ /* Allocate aligned memory from the thread's memory block. */
+ __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
+ {
+ /* bind the thread local allocator to the proper FastAllocator */
+ parent->bind(alloc);
+
+ assert(align <= maxAlignment);
+ bytesUsed += bytes;
+
+ /* try to allocate in local block */
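+ /* (align - cur) & (align-1) is the padding that rounds cur up to the next
+ multiple of align (align must be a power of two); e.g. cur == 13, align == 16
+ gives ofs == 3 */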
+ size_t ofs = (align - cur) & (align-1);
+ cur += bytes + ofs;
+ if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+ cur -= bytes + ofs;
+
+ /* if allocation is too large allocate with parent allocator */
+ if (4*bytes > allocBlockSize) {
+ return alloc->malloc(bytes,maxAlignment,false);
+ }
+
+ /* get new partial block if allocation failed */
+ size_t blockSize = allocBlockSize;
+ ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
+ bytesWasted += end-cur;
+ cur = 0; end = blockSize;
+
+ /* retry allocation */
+ ofs = (align - cur) & (align-1);
+ cur += bytes + ofs;
+ if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+ cur -= bytes + ofs;
+
+ /* get new full block if allocation failed */
+ blockSize = allocBlockSize;
+ ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
+ bytesWasted += end-cur;
+ cur = 0; end = blockSize;
+
+ /* retry allocation */
+ ofs = (align - cur) & (align-1);
+ cur += bytes + ofs;
+ if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+ cur -= bytes + ofs;
+
+ /* should never happen as large allocations get handled specially above */
+ assert(false);
+ return nullptr;
+ }
+
+
+ /*! returns amount of used bytes */
+ __forceinline size_t getUsedBytes() const { return bytesUsed; }
+
+ /*! returns amount of free bytes */
+ __forceinline size_t getFreeBytes() const { return end-cur; }
+
+ /*! returns amount of wasted bytes */
+ __forceinline size_t getWastedBytes() const { return bytesWasted; }
+
+ private:
+ ThreadLocal2* parent;
+ char* ptr; //!< pointer to memory block
+ size_t cur; //!< current location of the allocator
+ size_t end; //!< end of the memory block
+ size_t allocBlockSize; //!< block size for allocations
+ size_t bytesUsed; //!< number of total bytes allocated
+ size_t bytesWasted; //!< number of bytes wasted
+ };
+
+ /*! Two thread local structures. */
+ struct __aligned(64) ThreadLocal2
+ {
+ ALIGNED_CLASS_(64);
+ public:
+
+ __forceinline ThreadLocal2()
+ : alloc(nullptr), alloc0(this), alloc1(this) {}
+
+ /*! bind to fast allocator */
+ __forceinline void bind(FastAllocator* alloc_i)
+ {
+ assert(alloc_i);
+ if (alloc.load() == alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(mutex);
+#else
+ Lock<SpinLock> lock(mutex);
+#endif
+ //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
+ if (alloc.load()) {
+ alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
+ alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
+ alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+ }
+ alloc0.init(alloc_i);
+ alloc1.init(alloc_i);
+ alloc.store(alloc_i);
+ alloc_i->join(this);
+ }
+
+ /*! unbind to fast allocator */
+ void unbind(FastAllocator* alloc_i)
+ {
+ assert(alloc_i);
+ if (alloc.load() != alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(mutex);
+#else
+ Lock<SpinLock> lock(mutex);
+#endif
+ if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
+ alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
+ alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
+ alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+ alloc0.init(nullptr);
+ alloc1.init(nullptr);
+ alloc.store(nullptr);
+ }
+
+ public:
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::mutex mutex;
+#else
+ SpinLock mutex; //!< required as unbind is called from other threads
+#endif
+ std::atomic<FastAllocator*> alloc; //!< parent allocator
+ ThreadLocal alloc0;
+ ThreadLocal alloc1;
+ };
+
+ FastAllocator (Device* device, bool osAllocation)
+ : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
+ growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
+ primrefarray(device,0)
+ {
+ for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+ {
+ threadUsedBlocks[i] = nullptr;
+ threadBlocks[i] = nullptr;
+ assert(!slotMutex[i].isLocked());
+ }
+ }
+
+ ~FastAllocator () {
+ clear();
+ }
+
+ /*! returns the device attached to this allocator */
+ Device* getDevice() {
+ return device;
+ }
+
+ void share(mvector<PrimRef>& primrefarray_i) {
+ primrefarray = std::move(primrefarray_i);
+ }
+
+ void unshare(mvector<PrimRef>& primrefarray_o)
+ {
+ reset(); // this removes blocks that are allocated inside the shared primref array
+ primrefarray_o = std::move(primrefarray);
+ }
+
+ /*! returns first fast thread local allocator */
+ __forceinline ThreadLocal* _threadLocal() {
+ return &threadLocal2()->alloc0;
+ }
+
+ void setOSallocation(bool flag)
+ {
+ atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
+ }
+
+ private:
+
+ /*! returns both fast thread local allocators */
+ __forceinline ThreadLocal2* threadLocal2()
+ {
+ ThreadLocal2* alloc = thread_local_allocator2;
+ if (alloc == nullptr) {
+ thread_local_allocator2 = alloc = new ThreadLocal2;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
+ Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
+ s_thread_local_allocators.push_back(make_unique(alloc));
+ }
+ return alloc;
+ }
+
+ public:
+
+ __forceinline void join(ThreadLocal2* alloc)
+ {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
+ Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
+ thread_local_allocators.push_back(alloc);
+ }
+
+ public:
+
+ struct CachedAllocator
+ {
+ __forceinline CachedAllocator(void* ptr)
+ : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
+ {
+ assert(ptr == nullptr);
+ }
+
+ __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
+ : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}
+
+ __forceinline operator bool () const {
+ return alloc != nullptr;
+ }
+
+ __forceinline void* operator() (size_t bytes, size_t align = 16) const {
+ return talloc0->malloc(alloc,bytes,align);
+ }
+
+ __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
+ return talloc0->malloc(alloc,bytes,align);
+ }
+
+ __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
+ return talloc1->malloc(alloc,bytes,align);
+ }
+
+ public:
+ FastAllocator* alloc;
+ ThreadLocal* talloc0;
+ ThreadLocal* talloc1;
+ };
+
+ __forceinline CachedAllocator getCachedAllocator() {
+ return CachedAllocator(this,threadLocal2());
+ }
+
+ /*! Builder interface to create thread local allocator */
+ struct Create
+ {
+ public:
+ __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
+ __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }
+
+ private:
+ FastAllocator* allocator;
+ };
+
+ void internal_fix_used_blocks()
+ {
+ /* move thread local blocks to global block list */
+ for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
+ {
+ while (threadBlocks[i].load() != nullptr) {
+ Block* nextUsedBlock = threadBlocks[i].load()->next;
+ threadBlocks[i].load()->next = usedBlocks.load();
+ usedBlocks = threadBlocks[i].load();
+ threadBlocks[i] = nextUsedBlock;
+ }
+ threadBlocks[i] = nullptr;
+ }
+ }
+
+ static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
+#if defined(__AVX512ER__) // KNL
+ static const size_t mainAllocOverheadStatic = 15; //! 15 means ~6.7% allocation overhead through unfilled main alloc blocks
+#else
+ static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
+#endif
+ static const size_t mainAllocOverheadDynamic = 8; //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
+
+ /* calculates a single threaded threshold for the builders such
+ * that for small scenes the overhead of partly allocated blocks
+ * per thread is low */
+ size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
+ {
+ if (numPrimitives == 0 || bytesEstimated == 0)
+ return defaultThreshold;
+
+ /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
+ const size_t single_mode_factor = use_single_mode ? 1 : 2;
+ const size_t threadCount = TaskScheduler::threadCount();
+ const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;
+
+ /* if we do not have to limit number of threads use optimal threshold */
+ if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+ return defaultThreshold;
+
+ /* otherwise limit number of threads by calculating proper single thread threshold */
+ else {
+ double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
+ return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
+ }
+ }
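+
+ /* Worked example (illustrative numbers only): with threadLocalAllocOverhead == 20,
+ defaultBlockSize == 4096 and both thread-local allocators in use
+ (single_mode_factor == 2), singleThreadBytes == 2*20*4096 == 160KB; a scene
+ estimated at 1.6MB can keep only about 10 threads busy, so on machines with
+ more threads the single-thread threshold is raised instead of spawning
+ workers that would each waste a partially filled block. */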
+
+ __forceinline size_t alignSize(size_t i) {
+ return (i+127)/128*128;
+ }
+
+ /*! initializes the grow size */
+ __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
+ {
+ /* we do not need single thread local allocator mode */
+ use_single_mode = false;
+
+ /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
+ size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
+ size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
+ growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);
+
+ /* if we reached the maxAllocationSize for growSize, we can
+ * increase the number of allocation slots while still guaranteeing
+ * the main allocation overhead bound */
+ slotMask = 0x0;
+
+ if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
+ if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
+ if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
+ if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */
+
+ /* set the thread local alloc block size */
+ size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;
+
+ /* for sufficiently large scenes we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
+#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
+ const size_t threadCount = TaskScheduler::threadCount();
+ const size_t single_mode_factor = use_single_mode ? 1 : 2;
+ const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
+ if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+ defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);
+
+ /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
+ else
+#endif
+ defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);
+
+ if (bytesEstimated == 0) {
+ maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
+ defaultBlockSize = defaultBlockSizeSwitch;
+ }
+ log2_grow_size_scale = 0;
+
+ if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
+ if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
+ if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
+ if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
+ if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
+ if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
+ if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
+ }
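+
+ // Worked example (sketch, assuming maxAllocationSize is on the order of
+ // 2 MiB): a static build with bytesEstimated = 512 MiB gives blockSize =
+ // 512 MiB / 20 ~= 25.6 MiB, which is clamped to growSize ~= 2 MiB. As
+ // 512 MiB exceeds 8*20*2 MiB = 320 MiB but not 16*20*2 MiB = 640 MiB,
+ // slotMask becomes 0x7 (8 slots) and growSize stays unchanged.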
+
+ /*! initializes the allocator */
+ void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
+ {
+ internal_fix_used_blocks();
+ /* distribute the allocation to multiple thread block slots */
+ slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
+ if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
+ if (bytesReserve == 0) bytesReserve = bytesAllocate;
+ freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
+ estimatedSize = bytesEstimate;
+ initGrowSizeAndNumSlots(bytesEstimate,true);
+ }
+
+ /*! initializes the allocator from a size estimate only */
+ void init_estimate(size_t bytesEstimate)
+ {
+ internal_fix_used_blocks();
+ if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
+ /* single allocator mode ? */
+ estimatedSize = bytesEstimate;
+ initGrowSizeAndNumSlots(bytesEstimate,false);
+ }
+
+ /*! frees state not required after build */
+ __forceinline void cleanup()
+ {
+ internal_fix_used_blocks();
+
+ /* unbind all thread local allocators */
+ for (auto alloc : thread_local_allocators) alloc->unbind(this);
+ thread_local_allocators.clear();
+ }
+
+ /*! resets the allocator, memory blocks get reused */
+ void reset ()
+ {
+ internal_fix_used_blocks();
+
+ bytesUsed.store(0);
+ bytesFree.store(0);
+ bytesWasted.store(0);
+
+ /* reset all used blocks and move them to begin of free block list */
+ while (usedBlocks.load() != nullptr) {
+ usedBlocks.load()->reset_block();
+ Block* nextUsedBlock = usedBlocks.load()->next;
+ usedBlocks.load()->next = freeBlocks.load();
+ freeBlocks = usedBlocks.load();
+ usedBlocks = nextUsedBlock;
+ }
+
+ /* remove all shared blocks as they are re-added during build */
+ freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));
+
+ for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+ {
+ threadUsedBlocks[i] = nullptr;
+ threadBlocks[i] = nullptr;
+ }
+
+ /* unbind all thread local allocators */
+ for (auto alloc : thread_local_allocators) alloc->unbind(this);
+ thread_local_allocators.clear();
+ }
+
+ /*! frees all allocated memory */
+ __forceinline void clear()
+ {
+ cleanup();
+ bytesUsed.store(0);
+ bytesFree.store(0);
+ bytesWasted.store(0);
+ if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device);
+ usedBlocks = nullptr;
+ if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device);
+ freeBlocks = nullptr;
+ for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
+ threadUsedBlocks[i] = nullptr;
+ threadBlocks[i] = nullptr;
+ }
+ primrefarray.clear();
+ }
+
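+ /*! returns a scaling factor for the grow size that doubles on every call
+ * (2, 4, 8, ...), saturating at 2^16; used below to grow blocks
+ * progressively under allocation pressure */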
+ __forceinline size_t incGrowSizeScale()
+ {
+ size_t scale = log2_grow_size_scale.fetch_add(1)+1;
+ return size_t(1) << min(size_t(16),scale);
+ }
+
+ /*! thread safe allocation of memory */
+ void* malloc(size_t& bytes, size_t align, bool partial)
+ {
+ assert(align <= maxAlignment);
+
+ while (true)
+ {
+ /* allocate using current block */
+ size_t threadID = TaskScheduler::threadID();
+ size_t slot = threadID & slotMask;
+ Block* myUsedBlocks = threadUsedBlocks[slot];
+ if (myUsedBlocks) {
+ void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+ if (ptr) return ptr;
+ }
+
+ /* throw error if allocation is too large */
+ if (bytes > maxAllocationSize)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");
+
+ /* create blocks in parallel when there are no free blocks, which avoids a single global mutex */
+ if (likely(freeBlocks.load() == nullptr))
+ {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(slotMutex[slot]);
+#else
+ Lock<SpinLock> lock(slotMutex[slot]);
+#endif
+ if (myUsedBlocks == threadUsedBlocks[slot]) {
+ const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
+ const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
+ assert(allocSize >= bytes);
+ threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+ // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
+ }
+ continue;
+ }
+
+ /* otherwise reuse a free block, or allocate a new one under the global mutex */
+ {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(mutex);
+#else
+ Lock<SpinLock> lock(mutex);
+#endif
+ if (myUsedBlocks == threadUsedBlocks[slot])
+ {
+ if (freeBlocks.load() != nullptr) {
+ Block* nextFreeBlock = freeBlocks.load()->next;
+ freeBlocks.load()->next = usedBlocks;
+ __memory_barrier();
+ usedBlocks = freeBlocks.load();
+ threadUsedBlocks[slot] = freeBlocks.load();
+ freeBlocks = nextFreeBlock;
+ } else {
+ const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
+ usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+ }
+ }
+ }
+ }
+ }
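+
+ // Illustrative call pattern (sketch): 'bytes' is passed by reference and,
+ // when 'partial' is true, may be reduced to what still fits into the
+ // current block:
+ //
+ //   size_t bytes = 4096;
+ //   void* ptr = alloc->malloc(bytes,16,/*partial=*/true);
+ //   // 'bytes' now holds the number of bytes actually granted at 'ptr'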
+
+ /*! adds a block of application-provided memory as a SHARED block */
+ void addBlock(void* ptr, ssize_t bytes)
+ {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(mutex);
+#else
+ Lock<SpinLock> lock(mutex);
+#endif
+ const size_t sizeof_Header = offsetof(Block,data[0]);
+ void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
+ size_t ofs = (size_t) aptr - (size_t) ptr;
+ bytes -= ofs;
+ if (bytes < 4096) return; // ignore empty or very small blocks
+ freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
+ }
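+
+ // Example (sketch, assuming maxAlignment = 64): addBlock(ptr = 0x1008,
+ // bytes = 64 KiB) aligns the start up to aptr = 0x1040; the ofs = 0x38
+ // skipped bytes are recorded as 'wasted' in the new SHARED block.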
+
+ /* special allocation, used only by the Morton builder, a single time per build */
+ void* specialAlloc(size_t bytes)
+ {
+ assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
+ return freeBlocks.load()->ptr();
+ }
+
+ struct Statistics
+ {
+ Statistics ()
+ : bytesUsed(0), bytesFree(0), bytesWasted(0) {}
+
+ Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
+ : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}
+
+ Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
+ : bytesUsed(0), bytesFree(0), bytesWasted(0)
+ {
+ Block* usedBlocks = alloc->usedBlocks.load();
+ Block* freeBlocks = alloc->freeBlocks.load();
+ if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
+ if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
+ if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
+ if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
+ if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
+ }
+
+ std::string str(size_t numPrimitives)
+ {
+ std::stringstream str;
+ str.setf(std::ios::fixed, std::ios::floatfield);
+ str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+ << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
+ << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
+ << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
+ << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
+ return str.str();
+ }
+
+ friend Statistics operator+ ( const Statistics& a, const Statistics& b)
+ {
+ return Statistics(a.bytesUsed+b.bytesUsed,
+ a.bytesFree+b.bytesFree,
+ a.bytesWasted+b.bytesWasted);
+ }
+
+ size_t bytesAllocatedTotal() const {
+ return bytesUsed + bytesFree + bytesWasted;
+ }
+
+ public:
+ size_t bytesUsed;
+ size_t bytesFree;
+ size_t bytesWasted;
+ };
+
+ Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
+ return Statistics(this,atype,huge_pages);
+ }
+
+ size_t getUsedBytes() {
+ return bytesUsed;
+ }
+
+ size_t getWastedBytes() {
+ return bytesWasted;
+ }
+
+ struct AllStatistics
+ {
+ AllStatistics (FastAllocator* alloc)
+
+ : bytesUsed(alloc->bytesUsed),
+ bytesFree(alloc->bytesFree),
+ bytesWasted(alloc->bytesWasted),
+ stat_all(alloc,ANY_TYPE),
+ stat_malloc(alloc,ALIGNED_MALLOC),
+ stat_4K(alloc,EMBREE_OS_MALLOC,false),
+ stat_2M(alloc,EMBREE_OS_MALLOC,true),
+ stat_shared(alloc,SHARED) {}
+
+ AllStatistics (size_t bytesUsed,
+ size_t bytesFree,
+ size_t bytesWasted,
+ Statistics stat_all,
+ Statistics stat_malloc,
+ Statistics stat_4K,
+ Statistics stat_2M,
+ Statistics stat_shared)
+
+ : bytesUsed(bytesUsed),
+ bytesFree(bytesFree),
+ bytesWasted(bytesWasted),
+ stat_all(stat_all),
+ stat_malloc(stat_malloc),
+ stat_4K(stat_4K),
+ stat_2M(stat_2M),
+ stat_shared(stat_shared) {}
+
+ friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
+ {
+ return AllStatistics(a.bytesUsed+b.bytesUsed,
+ a.bytesFree+b.bytesFree,
+ a.bytesWasted+b.bytesWasted,
+ a.stat_all + b.stat_all,
+ a.stat_malloc + b.stat_malloc,
+ a.stat_4K + b.stat_4K,
+ a.stat_2M + b.stat_2M,
+ a.stat_shared + b.stat_shared);
+ }
+
+ void print(size_t numPrimitives)
+ {
+ std::stringstream str0;
+ str0.setf(std::ios::fixed, std::ios::floatfield);
+ str0 << " alloc : "
+ << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+ << " "
+ << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
+ std::cout << str0.str() << std::endl;
+
+ std::stringstream str1;
+ str1.setf(std::ios::fixed, std::ios::floatfield);
+ str1 << " alloc : "
+ << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+ << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
+ << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
+ << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
+ << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
+ std::cout << str1.str() << std::endl;
+
+ std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
+ std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
+ std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
+ std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
+ std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
+ }
+
+ private:
+ size_t bytesUsed;
+ size_t bytesFree;
+ size_t bytesWasted;
+ Statistics stat_all;
+ Statistics stat_malloc;
+ Statistics stat_4K;
+ Statistics stat_2M;
+ Statistics stat_shared;
+ };
+
+ void print_blocks()
+ {
+ std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+
+ std::cout << " used blocks = ";
+ if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
+ std::cout << "[END]" << std::endl;
+
+ std::cout << " free blocks = ";
+ if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
+ std::cout << "[END]" << std::endl;
+ }
+
+ private:
+
+ struct Block
+ {
+ static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+ {
+ /* We avoid using os_malloc for small blocks as this risks
+ * fragmenting the virtual address space and running into the
+ * limit of vm.max_map_count = 65k under Linux. */
+ if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
+ atype = ALIGNED_MALLOC;
+
+ /* we need to additionally allocate some header */
+ const size_t sizeof_Header = offsetof(Block,data[0]);
+ bytesAllocate = sizeof_Header+bytesAllocate;
+ bytesReserve = sizeof_Header+bytesReserve;
+
+ /* consume full 4k pages when using os_malloc */
+ if (atype == EMBREE_OS_MALLOC) {
+ bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+ bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+ }
+
+ /* either use alignedMalloc or os_malloc */
+ void *ptr = nullptr;
+ if (atype == ALIGNED_MALLOC)
+ {
+ /* special handling for default block size */
+ if (bytesAllocate == (2*PAGE_SIZE_2M))
+ {
+ const size_t alignment = maxAlignment;
+ if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+ ptr = alignedMalloc(bytesAllocate,alignment);
+
+ /* give hint to transparently convert these pages to 2MB pages */
+ const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
+ os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
+ os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
+ os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block
+
+ return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+ }
+ else
+ {
+ const size_t alignment = maxAlignment;
+ if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+ ptr = alignedMalloc(bytesAllocate,alignment);
+ return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+ }
+ }
+ else if (atype == EMBREE_OS_MALLOC)
+ {
+ if (device) device->memoryMonitor(bytesAllocate,false);
+ bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
+ return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+ }
+ else
+ assert(false);
+
+ return nullptr;
+ }
+
+ Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
+ : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
+ {
+ assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
+ }
+
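+ /*! unlinks all SHARED blocks from the list using the pointer-to-next
+ * idiom; shared blocks are re-added by the application during the next
+ * build (see reset()) */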
+ static Block* remove_shared_blocks(Block* head)
+ {
+ Block** prev_next = &head;
+ for (Block* block = head; block; block = block->next) {
+ if (block->atype == SHARED) *prev_next = block->next;
+ else prev_next = &block->next;
+ }
+ return head;
+ }
+
+ void clear_list(MemoryMonitorInterface* device)
+ {
+ Block* block = this;
+ while (block) {
+ Block* next = block->next;
+ block->clear_block(device);
+ block = next;
+ }
+ }
+
+ void clear_block (MemoryMonitorInterface* device)
+ {
+ const size_t sizeof_Header = offsetof(Block,data[0]);
+ const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
+
+ if (atype == ALIGNED_MALLOC) {
+ alignedFree(this);
+ if (device) device->memoryMonitor(-sizeof_Alloced,true);
+ }
+
+ else if (atype == EMBREE_OS_MALLOC) {
+ size_t sizeof_This = sizeof_Header+reserveEnd;
+ os_free(this,sizeof_This,huge_pages);
+ if (device) device->memoryMonitor(-sizeof_Alloced,true);
+ }
+
+ else /* if (atype == SHARED) */ {
+ }
+ }
+
+ void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
+ {
+ size_t bytes = bytes_in;
+ assert(align <= maxAlignment);
+ bytes = (bytes+(align-1)) & ~(align-1);
+ if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
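+ /* lock-free bump allocation: atomically reserve the range [i,i+bytes) */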
+ const size_t i = cur.fetch_add(bytes);
+ if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
+ if (unlikely(i > reserveEnd)) return nullptr;
+ bytes_in = bytes = min(bytes,reserveEnd-i);
+
+ if (i+bytes > allocEnd) {
+ if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
+ }
+ return &data[i];
+ }
+
+ void* ptr() {
+ return &data[cur];
+ }
+
+ void reset_block ()
+ {
+ allocEnd = max(allocEnd,(size_t)cur);
+ cur = 0;
+ }
+
+ size_t getBlockUsedBytes() const {
+ return min(size_t(cur),reserveEnd);
+ }
+
+ size_t getBlockFreeBytes() const {
+ return getBlockAllocatedBytes() - getBlockUsedBytes();
+ }
+
+ size_t getBlockAllocatedBytes() const {
+ return min(max(allocEnd,size_t(cur)),reserveEnd);
+ }
+
+ size_t getBlockWastedBytes() const {
+ const size_t sizeof_Header = offsetof(Block,data[0]);
+ return sizeof_Header + wasted;
+ }
+
+ size_t getBlockReservedBytes() const {
+ return reserveEnd;
+ }
+
+ bool hasType(AllocationType atype_i, bool huge_pages_i) const
+ {
+ if (atype_i == ANY_TYPE ) return true;
+ else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+ else return atype_i == atype;
+ }
+
+ size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
+ size_t bytes = 0;
+ for (const Block* block = this; block; block = block->next) {
+ if (!block->hasType(atype,huge_pages)) continue;
+ bytes += block->getBlockUsedBytes();
+ }
+ return bytes;
+ }
+
+ size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
+ size_t bytes = 0;
+ for (const Block* block = this; block; block = block->next) {
+ if (!block->hasType(atype,huge_pages)) continue;
+ bytes += block->getBlockFreeBytes();
+ }
+ return bytes;
+ }
+
+ size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
+ size_t bytes = 0;
+ for (const Block* block = this; block; block = block->next) {
+ if (!block->hasType(atype,huge_pages)) continue;
+ bytes += block->getBlockWastedBytes();
+ }
+ return bytes;
+ }
+
+ size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
+ size_t bytes = 0;
+ for (const Block* block = this; block; block = block->next) {
+ if (!block->hasType(atype,huge_pages)) continue;
+ bytes += block->getBlockAllocatedBytes();
+ }
+ return bytes;
+ }
+
+ void print_list ()
+ {
+ for (const Block* block = this; block; block = block->next)
+ block->print_block();
+ }
+
+ void print_block() const
+ {
+ if (atype == ALIGNED_MALLOC) std::cout << "A";
+ else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
+ else if (atype == SHARED) std::cout << "S";
+ if (huge_pages) std::cout << "H";
+ size_t bytesUsed = getBlockUsedBytes();
+ size_t bytesFree = getBlockFreeBytes();
+ size_t bytesWasted = getBlockWastedBytes();
+ std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
+ }
+
+ public:
+ std::atomic<size_t> cur; //!< current location of the allocator
+ std::atomic<size_t> allocEnd; //!< end of the allocated memory region
+ std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
+ Block* next; //!< pointer to next block in list
+ size_t wasted; //!< amount of memory wasted through block alignment
+ AllocationType atype; //!< allocation mode of the block
+ bool huge_pages; //!< whether the block uses huge pages
+ char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
+ char data[1]; //!< here starts memory to use for allocations
+ };
+
+ private:
+ Device* device;
+ SpinLock mutex;
+ size_t slotMask;
+ std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+ std::atomic<Block*> usedBlocks;
+ std::atomic<Block*> freeBlocks;
+
+ std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
+ SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
+
+ bool use_single_mode;
+ size_t defaultBlockSize;
+ size_t estimatedSize;
+ size_t growSize;
+ size_t maxGrowSize;
+ std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
+ std::atomic<size_t> bytesUsed;
+ std::atomic<size_t> bytesFree;
+ std::atomic<size_t> bytesWasted;
+ static __thread ThreadLocal2* thread_local_allocator2;
+ static SpinLock s_thread_local_allocators_lock;
+ static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::mutex thread_local_allocators_lock;
+#else
+ SpinLock thread_local_allocators_lock;
+#endif
+ std::vector<ThreadLocal2*> thread_local_allocators;
+ AllocationType atype;
+ mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h
new file mode 100644
index 0000000000..02d319c59d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/buffer.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+
+namespace embree
+{
+ /*! Implements an API data buffer object. This class may or may not own the data. */
+ class Buffer : public RefCount
+ {
+ public:
+ /*! Buffer construction */
+ Buffer()
+ : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
+
+ /*! Buffer construction */
+ Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr)
+ : device(device), numBytes(numBytes_in)
+ {
+ device->refInc();
+
+ if (ptr_in)
+ {
+ shared = true;
+ ptr = (char*)ptr_in;
+ }
+ else
+ {
+ shared = false;
+ alloc();
+ }
+ }
+
+ /*! Buffer destruction */
+ ~Buffer() {
+ free();
+ if (device) device->refDec(); // guard: default-constructed buffers have no device
+ }
+
+ /*! this class is not copyable */
+ private:
+ Buffer(const Buffer& other) DELETED; // do not implement
+ Buffer& operator =(const Buffer& other) DELETED; // do not implement
+
+ public:
+ /* inits and allocates the buffer */
+ void create(Device* device_in, size_t numBytes_in)
+ {
+ init(device_in, numBytes_in);
+ alloc();
+ }
+
+ /* inits the buffer */
+ void init(Device* device_in, size_t numBytes_in)
+ {
+ free();
+ device = device_in;
+ ptr = nullptr;
+ numBytes = numBytes_in;
+ shared = false;
+ }
+
+ /*! sets shared buffer */
+ void set(Device* device_in, void* ptr_in, size_t numBytes_in)
+ {
+ free();
+ device = device_in;
+ ptr = (char*)ptr_in;
+ if (numBytes_in != (size_t)-1)
+ numBytes = numBytes_in;
+ shared = true;
+ }
+
+ /*! allocates the buffer */
+ void alloc()
+ {
+ if (device)
+ device->memoryMonitor(this->bytes(), false);
+ size_t b = (this->bytes()+15) & ssize_t(-16);
+ ptr = (char*)alignedMalloc(b,16);
+ }
+
+ /*! frees the buffer */
+ void free()
+ {
+ if (shared) return;
+ alignedFree(ptr);
+ if (device)
+ device->memoryMonitor(-ssize_t(this->bytes()), true);
+ ptr = nullptr;
+ }
+
+ /*! gets buffer pointer */
+ void* data()
+ {
+ /* report error if the buffer does not exist */
+ if (!device)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified");
+
+ /* return buffer */
+ return ptr;
+ }
+
+ /*! returns pointer to first element */
+ __forceinline char* getPtr() const {
+ return ptr;
+ }
+
+ /*! returns the number of bytes of the buffer */
+ __forceinline size_t bytes() const {
+ return numBytes;
+ }
+
+ /*! returns true if the buffer is not empty */
+ __forceinline operator bool() const {
+ return ptr;
+ }
+
+ public:
+ Device* device; //!< device to report memory usage to
+ char* ptr; //!< pointer to buffer data
+ size_t numBytes; //!< number of bytes in the buffer
+ bool shared; //!< set if memory is shared with application
+ };
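+
+ // Usage sketch (illustrative, not Embree API): an owned buffer allocates
+ // 16-byte aligned storage and reports it to the device's memory monitor,
+ // while a shared buffer merely wraps application memory ('appPtr' is a
+ // hypothetical application pointer):
+ //
+ //   Ref<Buffer> owned = new Buffer(device, 1024);           // allocates
+ //   Ref<Buffer> wrapped = new Buffer(device, 1024, appPtr); // shared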
+
+ /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */
+ class RawBufferView
+ {
+ public:
+ /*! Buffer construction */
+ RawBufferView()
+ : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {}
+
+ public:
+ /*! sets the buffer view */
+ void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in)
+ {
+ if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes))
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds");
+
+ ptr_ofs = buffer_in->ptr + offset_in;
+ stride = stride_in;
+ num = num_in;
+ format = format_in;
+ modCounter++;
+ modified = true;
+ buffer = buffer_in;
+ }
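+
+ // Illustrative sketch (hypothetical values): viewing interleaved vertex
+ // data with positions at offset 0 and a 32-byte stride:
+ //
+ //   BufferView<Vec3fa> positions;
+ //   positions.set(buffer, /*offset=*/0, /*stride=*/32, numVertices, RTC_FORMAT_FLOAT3);
+ //   Vec3fa p = positions[i]; // loads 16 bytes from ptr_ofs + i*32
+ //
+ // Note that the Vec3fa specialization below loads a full 16 bytes per
+ // element, which is why checkPadding16() requires padding at the buffer end.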
+
+ /*! returns pointer to the first element */
+ __forceinline char* getPtr() const {
+ return ptr_ofs;
+ }
+
+ /*! returns pointer to the i'th element */
+ __forceinline char* getPtr(size_t i) const
+ {
+ assert(i<num);
+ return ptr_ofs + i*stride;
+ }
+
+ /*! returns the number of elements of the buffer */
+ __forceinline size_t size() const {
+ return num;
+ }
+
+ /*! returns the number of bytes of the buffer */
+ __forceinline size_t bytes() const {
+ return num*stride;
+ }
+
+ /*! returns the buffer stride */
+ __forceinline unsigned getStride() const
+ {
+ assert(stride <= unsigned(inf));
+ return unsigned(stride);
+ }
+
+ /*! return the buffer format */
+ __forceinline RTCFormat getFormat() const {
+ return format;
+ }
+
+ /*! marks the buffer as modified */
+ __forceinline void setModified() {
+ modCounter++;
+ modified = true;
+ }
+
+ /*! checks whether the buffer changed after the given mod counter */
+ __forceinline bool isModified(unsigned int otherModCounter) const {
+ return modCounter > otherModCounter;
+ }
+
+ /*! returns whether the buffer was modified locally */
+ __forceinline bool isLocalModified() const {
+ return modified;
+ }
+
+ /*! clear local modified flag */
+ __forceinline void clearLocalModified() {
+ modified = false;
+ }
+
+ /*! returns true if the buffer is not empty */
+ __forceinline operator bool() const {
+ return ptr_ofs;
+ }
+
+ /*! checks for 16 bytes of padding at the end of the buffer; deliberately crashes here, rather than later during traversal, if the padding is missing */
+ __forceinline void checkPadding16() const
+ {
+ if (ptr_ofs && num)
+ volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable?
+ }
+
+ public:
+ char* ptr_ofs; //!< base pointer plus offset
+ size_t stride; //!< stride of the buffer in bytes
+ size_t num; //!< number of elements in the buffer
+ RTCFormat format; //!< format of the buffer
+ unsigned int modCounter; //!< version ID of this buffer
+ bool modified; //!< local modified data
+ int userData; //!< special data
+ Ref<Buffer> buffer; //!< reference to the parent buffer
+ };
+
+ /*! A typed contiguous range of a buffer. This class does not own the buffer content. */
+ template<typename T>
+ class BufferView : public RawBufferView
+ {
+ public:
+ typedef T value_type;
+
+ /*! access to the ith element of the buffer */
+ __forceinline T& operator [](size_t i) { assert(i<num); return *(T*)(ptr_ofs + i*stride); }
+ __forceinline const T& operator [](size_t i) const { assert(i<num); return *(T*)(ptr_ofs + i*stride); }
+ };
+
+ template<>
+ class BufferView<Vec3fa> : public RawBufferView
+ {
+ public:
+ typedef Vec3fa value_type;
+
+ /*! access to the ith element of the buffer */
+ __forceinline const Vec3fa operator [](size_t i) const
+ {
+ assert(i<num);
+ return Vec3fa(vfloat4::loadu((float*)(ptr_ofs + i*stride)));
+ }
+
+ /*! writes the i'th element */
+ __forceinline void store(size_t i, const Vec3fa& v)
+ {
+ assert(i<num);
+ vfloat4::storeu((float*)(ptr_ofs + i*stride), (vfloat4)v);
+ }
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/builder.h b/thirdparty/embree-aarch64/kernels/common/builder.h
new file mode 100644
index 0000000000..d2a1cfe3ce
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/builder.h
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "accel.h"
+
+namespace embree
+{
+#define MODE_HIGH_QUALITY (1<<8)
+
+ /*! virtual interface for all hierarchy builders */
+ class Builder : public RefCount {
+ public:
+
+ static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+ /*! initiates the hierarchy builder */
+ virtual void build() = 0;
+
+ /*! notifies the builder about the deletion of some geometry */
+ virtual void deleteGeometry(size_t geomID) {}
+
+ /*! clears internal builder state */
+ virtual void clear() = 0;
+ };
+
+ /*! virtual interface for progress monitor class */
+ struct BuildProgressMonitor {
+ virtual void operator() (size_t dn) const = 0;
+ };
+
+ /*! build the progress monitor interface from a closure */
+ template<typename Closure>
+ struct ProgressMonitorClosure : BuildProgressMonitor
+ {
+ public:
+ ProgressMonitorClosure (const Closure& closure) : closure(closure) {}
+ void operator() (size_t dn) const { closure(dn); }
+ private:
+ const Closure closure;
+ };
+ template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) {
+ return ProgressMonitorClosure<Closure>(closure);
+ }
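+
+ // Usage sketch (illustrative): wrapping a lambda as a progress monitor;
+ // 'primitivesDone' is a hypothetical counter:
+ //
+ //   size_t primitivesDone = 0;
+ //   auto monitor = BuildProgressMonitorFromClosure([&] (size_t dn) { primitivesDone += dn; });
+ //   monitor(128); // reports 128 additional primitives as processed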
+
+ struct LineSegments;
+ struct TriangleMesh;
+ struct QuadMesh;
+ struct UserGeometry;
+
+ class Scene;
+
+ typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder);
+ typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+ typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+ typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h
new file mode 100644
index 0000000000..d0185a74f2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/context.h
@@ -0,0 +1,131 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "rtcore.h"
+#include "point_query.h"
+
+namespace embree
+{
+ class Scene;
+
+ struct IntersectContext
+ {
+ public:
+ __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context)
+ : scene(scene), user(user_context) {}
+
+ __forceinline bool hasContextFilter() const {
+ return user->filter != nullptr;
+ }
+
+ __forceinline bool isCoherent() const {
+ return embree::isCoherent(user->flags);
+ }
+
+ __forceinline bool isIncoherent() const {
+ return embree::isIncoherent(user->flags);
+ }
+
+ public:
+ Scene* scene;
+ RTCIntersectContext* user;
+ };
+
+ template<int M, typename Geometry>
+ __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
+ {
+#if RTC_MIN_WIDTH
+ const vfloat<M> d = length(Vec3vf<M>(v) - ray_org);
+ const vfloat<M> r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w);
+ return Vec4vf<M>(v.x,v.y,v.z,r);
+#else
+ return v;
+#endif
+ }
+
+ template<typename Geometry>
+ __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
+ {
+#if RTC_MIN_WIDTH
+ const float d = length(Vec3fa(v) - ray_org);
+ const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w);
+ return Vec3ff(v.x,v.y,v.z,r);
+#else
+ return v;
+#endif
+ }
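+
+ // Example (sketch, with assumed numbers): for minWidthDistanceFactor = 0.01,
+ // a control point of radius v.w = 0.001 at distance d = 10 from the ray
+ // origin gets radius clamp(0.1, 0.001, maxRadiusScale*0.001): the radius
+ // grows with distance but is capped at maxRadiusScale times its original value.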
+
+ enum PointQueryType
+ {
+ POINT_QUERY_TYPE_UNDEFINED = 0,
+ POINT_QUERY_TYPE_SPHERE = 1,
+ POINT_QUERY_TYPE_AABB = 2,
+ };
+
+ typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+
+ struct PointQueryContext
+ {
+ public:
+ __forceinline PointQueryContext(Scene* scene,
+ PointQuery* query_ws,
+ PointQueryType query_type,
+ PointQueryFunction func,
+ RTCPointQueryContext* userContext,
+ float similarityScale,
+ void* userPtr)
+ : scene(scene)
+ , query_ws(query_ws)
+ , query_type(query_type)
+ , func(func)
+ , userContext(userContext)
+ , similarityScale(similarityScale)
+ , userPtr(userPtr)
+ , primID(RTC_INVALID_GEOMETRY_ID)
+ , geomID(RTC_INVALID_GEOMETRY_ID)
+ , query_radius(query_ws->radius)
+ {
+ if (query_type == POINT_QUERY_TYPE_AABB) {
+ assert(similarityScale == 0.f);
+ updateAABB();
+ }
+ if (userContext->instStackSize == 0) {
+ assert(similarityScale == 1.f);
+ }
+ }
+
+ public:
+ __forceinline void updateAABB()
+ {
+ if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) {
+ query_radius = Vec3fa(query_ws->radius);
+ return;
+ }
+
+ const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+ BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius));
+ bbox = xfmBounds(m, bbox);
+ query_radius = 0.5f * (bbox.upper - bbox.lower);
+ }
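+
+ // Sketch of the idea: for an instanced query the world-space sphere is
+ // transformed by world2inst; half the extent of the transformed bounding
+ // box yields a conservative per-axis radius in instance space.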
+
+ public:
+ Scene* scene;
+
+ PointQuery* query_ws; // the original world space point query
+ PointQueryType query_type;
+ PointQueryFunction func;
+ RTCPointQueryContext* userContext;
+ const float similarityScale;
+
+ void* userPtr;
+
+ unsigned int primID;
+ unsigned int geomID;
+
+ Vec3fa query_radius; // used if the query is converted to an AABB internally
+ };
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h
new file mode 100644
index 0000000000..709119163b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/default.h
@@ -0,0 +1,273 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+#include "../../common/sys/thread.h"
+#include "../../common/sys/alloc.h"
+#include "../../common/sys/ref.h"
+#include "../../common/sys/intrinsics.h"
+#include "../../common/sys/atomic.h"
+#include "../../common/sys/mutex.h"
+#include "../../common/sys/vector.h"
+#include "../../common/sys/array.h"
+#include "../../common/sys/string.h"
+#include "../../common/sys/regression.h"
+#include "../../common/sys/vector.h"
+
+#include "../../common/math/math.h"
+#include "../../common/math/transcendental.h"
+#include "../../common/simd/simd.h"
+#include "../../common/math/vec2.h"
+#include "../../common/math/vec3.h"
+#include "../../common/math/vec4.h"
+#include "../../common/math/vec2fa.h"
+#include "../../common/math/vec3fa.h"
+#include "../../common/math/interval.h"
+#include "../../common/math/bbox.h"
+#include "../../common/math/obbox.h"
+#include "../../common/math/lbbox.h"
+#include "../../common/math/linearspace2.h"
+#include "../../common/math/linearspace3.h"
+#include "../../common/math/affinespace.h"
+#include "../../common/math/range.h"
+#include "../../common/lexers/tokenstream.h"
+
+#include "../../common/tasking/taskscheduler.h"
+
+#define COMMA ,
+
+#include "../config.h"
+#include "isa.h"
+#include "stat.h"
+#include "profile.h"
+#include "rtcore.h"
+#include "vector.h"
+#include "state.h"
+#include "instance_stack.h"
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <functional>
+#include <utility>
+#include <sstream>
+
+#if !defined(_DEBUG) && defined(BUILD_IOS)
+#undef assert
+#define assert(_EXPR)
+#endif
+
+namespace embree
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Vec2 shortcuts
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int N> using Vec2vf = Vec2<vfloat<N>>;
+ template<int N> using Vec2vd = Vec2<vdouble<N>>;
+ template<int N> using Vec2vr = Vec2<vreal<N>>;
+ template<int N> using Vec2vi = Vec2<vint<N>>;
+ template<int N> using Vec2vl = Vec2<vllong<N>>;
+ template<int N> using Vec2vb = Vec2<vbool<N>>;
+ template<int N> using Vec2vbf = Vec2<vboolf<N>>;
+ template<int N> using Vec2vbd = Vec2<vboold<N>>;
+
+ typedef Vec2<vfloat4> Vec2vf4;
+ typedef Vec2<vdouble4> Vec2vd4;
+ typedef Vec2<vreal4> Vec2vr4;
+ typedef Vec2<vint4> Vec2vi4;
+ typedef Vec2<vllong4> Vec2vl4;
+ typedef Vec2<vbool4> Vec2vb4;
+ typedef Vec2<vboolf4> Vec2vbf4;
+ typedef Vec2<vboold4> Vec2vbd4;
+
+ typedef Vec2<vfloat8> Vec2vf8;
+ typedef Vec2<vdouble8> Vec2vd8;
+ typedef Vec2<vreal8> Vec2vr8;
+ typedef Vec2<vint8> Vec2vi8;
+ typedef Vec2<vllong8> Vec2vl8;
+ typedef Vec2<vbool8> Vec2vb8;
+ typedef Vec2<vboolf8> Vec2vbf8;
+ typedef Vec2<vboold8> Vec2vbd8;
+
+ typedef Vec2<vfloat16> Vec2vf16;
+ typedef Vec2<vdouble16> Vec2vd16;
+ typedef Vec2<vreal16> Vec2vr16;
+ typedef Vec2<vint16> Vec2vi16;
+ typedef Vec2<vllong16> Vec2vl16;
+ typedef Vec2<vbool16> Vec2vb16;
+ typedef Vec2<vboolf16> Vec2vbf16;
+ typedef Vec2<vboold16> Vec2vbd16;
+
+ typedef Vec2<vfloatx> Vec2vfx;
+ typedef Vec2<vdoublex> Vec2vdx;
+ typedef Vec2<vrealx> Vec2vrx;
+ typedef Vec2<vintx> Vec2vix;
+ typedef Vec2<vllongx> Vec2vlx;
+ typedef Vec2<vboolx> Vec2vbx;
+ typedef Vec2<vboolfx> Vec2vbfx;
+ typedef Vec2<vbooldx> Vec2vbdx;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Vec3 shortcuts
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int N> using Vec3vf = Vec3<vfloat<N>>;
+ template<int N> using Vec3vd = Vec3<vdouble<N>>;
+ template<int N> using Vec3vr = Vec3<vreal<N>>;
+ template<int N> using Vec3vi = Vec3<vint<N>>;
+ template<int N> using Vec3vl = Vec3<vllong<N>>;
+ template<int N> using Vec3vb = Vec3<vbool<N>>;
+ template<int N> using Vec3vbf = Vec3<vboolf<N>>;
+ template<int N> using Vec3vbd = Vec3<vboold<N>>;
+
+ typedef Vec3<vfloat4> Vec3vf4;
+ typedef Vec3<vdouble4> Vec3vd4;
+ typedef Vec3<vreal4> Vec3vr4;
+ typedef Vec3<vint4> Vec3vi4;
+ typedef Vec3<vllong4> Vec3vl4;
+ typedef Vec3<vbool4> Vec3vb4;
+ typedef Vec3<vboolf4> Vec3vbf4;
+ typedef Vec3<vboold4> Vec3vbd4;
+
+ typedef Vec3<vfloat8> Vec3vf8;
+ typedef Vec3<vdouble8> Vec3vd8;
+ typedef Vec3<vreal8> Vec3vr8;
+ typedef Vec3<vint8> Vec3vi8;
+ typedef Vec3<vllong8> Vec3vl8;
+ typedef Vec3<vbool8> Vec3vb8;
+ typedef Vec3<vboolf8> Vec3vbf8;
+ typedef Vec3<vboold8> Vec3vbd8;
+
+ typedef Vec3<vfloat16> Vec3vf16;
+ typedef Vec3<vdouble16> Vec3vd16;
+ typedef Vec3<vreal16> Vec3vr16;
+ typedef Vec3<vint16> Vec3vi16;
+ typedef Vec3<vllong16> Vec3vl16;
+ typedef Vec3<vbool16> Vec3vb16;
+ typedef Vec3<vboolf16> Vec3vbf16;
+ typedef Vec3<vboold16> Vec3vbd16;
+
+ typedef Vec3<vfloatx> Vec3vfx;
+ typedef Vec3<vdoublex> Vec3vdx;
+ typedef Vec3<vrealx> Vec3vrx;
+ typedef Vec3<vintx> Vec3vix;
+ typedef Vec3<vllongx> Vec3vlx;
+ typedef Vec3<vboolx> Vec3vbx;
+ typedef Vec3<vboolfx> Vec3vbfx;
+ typedef Vec3<vbooldx> Vec3vbdx;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Vec4 shortcuts
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int N> using Vec4vf = Vec4<vfloat<N>>;
+ template<int N> using Vec4vd = Vec4<vdouble<N>>;
+ template<int N> using Vec4vr = Vec4<vreal<N>>;
+ template<int N> using Vec4vi = Vec4<vint<N>>;
+ template<int N> using Vec4vl = Vec4<vllong<N>>;
+ template<int N> using Vec4vb = Vec4<vbool<N>>;
+ template<int N> using Vec4vbf = Vec4<vboolf<N>>;
+ template<int N> using Vec4vbd = Vec4<vboold<N>>;
+
+ typedef Vec4<vfloat4> Vec4vf4;
+ typedef Vec4<vdouble4> Vec4vd4;
+ typedef Vec4<vreal4> Vec4vr4;
+ typedef Vec4<vint4> Vec4vi4;
+ typedef Vec4<vllong4> Vec4vl4;
+ typedef Vec4<vbool4> Vec4vb4;
+ typedef Vec4<vboolf4> Vec4vbf4;
+ typedef Vec4<vboold4> Vec4vbd4;
+
+ typedef Vec4<vfloat8> Vec4vf8;
+ typedef Vec4<vdouble8> Vec4vd8;
+ typedef Vec4<vreal8> Vec4vr8;
+ typedef Vec4<vint8> Vec4vi8;
+ typedef Vec4<vllong8> Vec4vl8;
+ typedef Vec4<vbool8> Vec4vb8;
+ typedef Vec4<vboolf8> Vec4vbf8;
+ typedef Vec4<vboold8> Vec4vbd8;
+
+ typedef Vec4<vfloat16> Vec4vf16;
+ typedef Vec4<vdouble16> Vec4vd16;
+ typedef Vec4<vreal16> Vec4vr16;
+ typedef Vec4<vint16> Vec4vi16;
+ typedef Vec4<vllong16> Vec4vl16;
+ typedef Vec4<vbool16> Vec4vb16;
+ typedef Vec4<vboolf16> Vec4vbf16;
+ typedef Vec4<vboold16> Vec4vbd16;
+
+ typedef Vec4<vfloatx> Vec4vfx;
+ typedef Vec4<vdoublex> Vec4vdx;
+ typedef Vec4<vrealx> Vec4vrx;
+ typedef Vec4<vintx> Vec4vix;
+ typedef Vec4<vllongx> Vec4vlx;
+ typedef Vec4<vboolx> Vec4vbx;
+ typedef Vec4<vboolfx> Vec4vbfx;
+ typedef Vec4<vbooldx> Vec4vbdx;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Other shortcuts
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<int N> using BBox3vf = BBox<Vec3vf<N>>;
+ typedef BBox<Vec3vf4> BBox3vf4;
+ typedef BBox<Vec3vf8> BBox3vf8;
+ typedef BBox<Vec3vf16> BBox3vf16;
+
+ /* calculate time segment itime and fractional time ftime */
+ __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime)
+ {
+ const float timeScaled = time * numTimeSegments;
+ const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f);
+ ftime = timeScaled - itimef;
+ return int(itimef);
+ }
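+
+ // Example: time = 0.7 with numTimeSegments = 4 gives timeScaled = 2.8,
+ // itime = 2 and ftime = 0.8, i.e. blend time steps 2 and 3 with weight 0.8.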
+
+ __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime)
+ {
+ const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments;
+ const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f);
+ ftime = timeScaled - itimef;
+ return int(itimef);
+ }
+
+ template<int N>
+ __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+ {
+ const vfloat<N> timeScaled = time * numTimeSegments;
+ const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f);
+ ftime = timeScaled - itimef;
+ return vint<N>(itimef);
+ }
+
+ template<int N>
+ __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+ {
+ const vfloat<N> timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments;
+ const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f);
+ ftime = timeScaled - itimef;
+ return vint<N>(itimef);
+ }
+
+ /* calculate overlapping time segment range */
+ __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments)
+ {
+ const float round_up = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step
+ const float round_down = 1.0f-2.0f*float(ulp);
+ const int itime_lower = (int)max(floor(round_up *time_range.lower*numTimeSegments), 0.0f);
+ const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments);
+ return make_range(itime_lower, itime_upper);
+ }
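+
+ // Example: time_range = [0.2,0.6] with numTimeSegments = 4 yields
+ // itime_lower = max(floor(0.8),0) = 0 and itime_upper = min(ceil(2.4),4) = 3,
+ // so make_range(0,3) is returned.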
+
+ /* calculate overlapping time segment range */
+ __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments)
+ {
+ const float lower = (range.lower-time_range.lower)/time_range.size();
+ const float upper = (range.upper-time_range.lower)/time_range.size();
+ return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments);
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp
new file mode 100644
index 0000000000..16ec11b892
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.cpp
@@ -0,0 +1,567 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "device.h"
+#include "../hash.h"
+#include "scene_triangle_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_subdiv_mesh.h"
+
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+#include "../geometry/cylinder.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+
+#include "../../common/tasking/taskscheduler.h"
+#include "../../common/sys/alloc.h"
+
+namespace embree
+{
+ /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */
+ ssize_t Device::debug_int0 = 0;
+ ssize_t Device::debug_int1 = 0;
+ ssize_t Device::debug_int2 = 0;
+ ssize_t Device::debug_int3 = 0;
+
+ DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs);
+
+ static MutexSys g_mutex;
+ static std::map<Device*,size_t> g_cache_size_map;
+ static std::map<Device*,size_t> g_num_threads_map;
+
+ Device::Device (const char* cfg)
+ {
+ /* check that CPU supports lowest ISA */
+ if (!hasISA(ISA)) {
+ throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR);
+ }
+
+ /* set default frequency level for detected CPU */
+ switch (getCPUModel()) {
+ case CPU::UNKNOWN: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::XEON_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::CORE_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::CORE_KABY_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::XEON_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::CORE_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::XEON_BROADWELL: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::CORE_BROADWELL: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::XEON_HASWELL: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::CORE_HASWELL: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::SANDY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+ case CPU::NEHALEM: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::CORE2: frequency_level = FREQUENCY_SIMD128; break;
+ case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break;
+ }
+
+ /* initialize global state */
+#if defined(EMBREE_CONFIG)
+ State::parseString(EMBREE_CONFIG);
+#endif
+ State::parseString(cfg);
+ if (!ignore_config_files && FileName::executableFolder() != FileName(""))
+ State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+ if (!ignore_config_files && FileName::homeFolder() != FileName(""))
+ State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+ State::verify();
+
+ /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */
+ if (!checkISASupport()) {
+ throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA");
+ }
+
+ /*! do some internal tests */
+ assert(isa::Cylinder::verify());
+
+ /*! enable huge page support if desired */
+#if defined(__WIN32__)
+ if (State::enable_selockmemoryprivilege)
+ State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3));
+#endif
+ State::hugepages_success &= os_init(State::hugepages,State::verbosity(3));
+
+ /*! set tessellation cache size */
+ setCacheSize( State::tessellation_cache_size );
+
+ /*! enable some floating point exceptions to catch bugs */
+ if (State::float_exceptions)
+ {
+ int exceptions = _MM_MASK_MASK;
+ //exceptions &= ~_MM_MASK_INVALID;
+ exceptions &= ~_MM_MASK_DENORM;
+ exceptions &= ~_MM_MASK_DIV_ZERO;
+ //exceptions &= ~_MM_MASK_OVERFLOW;
+ //exceptions &= ~_MM_MASK_UNDERFLOW;
+ //exceptions &= ~_MM_MASK_INEXACT;
+ _MM_SET_EXCEPTION_MASK(exceptions);
+ }
+
+ /* print info header */
+ if (State::verbosity(1))
+ print();
+ if (State::verbosity(2))
+ State::print();
+
+ /* register all algorithms */
+ bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features));
+
+#if defined(EMBREE_TARGET_SIMD8)
+ bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features));
+#endif
+
+ /* setup tasking system */
+ initTaskingSystem(numThreads);
+
+ /* ray stream SOA to AOS conversion */
+#if defined(EMBREE_RAY_PACKETS)
+ RayStreamFilterFuncsType rayStreamFilterFuncs;
+ SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs);
+ rayStreamFilters = rayStreamFilterFuncs();
+#endif
+ }
+
+ Device::~Device ()
+ {
+ setCacheSize(0);
+ exitTaskingSystem();
+ }
+
+ std::string getEnabledTargets()
+ {
+ std::string v;
+#if defined(EMBREE_TARGET_SSE2)
+ v += "SSE2 ";
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+ v += "SSE4.2 ";
+#endif
+#if defined(EMBREE_TARGET_AVX)
+ v += "AVX ";
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+ v += "AVX2 ";
+#endif
+#if defined(EMBREE_TARGET_AVX512KNL)
+ v += "AVX512KNL ";
+#endif
+#if defined(EMBREE_TARGET_AVX512SKX)
+ v += "AVX512SKX ";
+#endif
+ return v;
+ }
+
+ std::string getEmbreeFeatures()
+ {
+ std::string v;
+#if defined(EMBREE_RAY_MASK)
+ v += "raymasks ";
+#endif
+#if defined (EMBREE_BACKFACE_CULLING)
+ v += "backfaceculling ";
+#endif
+#if defined (EMBREE_BACKFACE_CULLING_CURVES)
+ v += "backfacecullingcurves ";
+#endif
+#if defined(EMBREE_FILTER_FUNCTION)
+ v += "intersection_filter ";
+#endif
+#if defined (EMBREE_COMPACT_POLYS)
+ v += "compact_polys ";
+#endif
+ return v;
+ }
+
+ void Device::print()
+ {
+ const int cpu_features = getCPUFeatures();
+ std::cout << std::endl;
+ std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl;
+ std::cout << " Compiler : " << getCompilerName() << std::endl;
+ std::cout << " Build : ";
+#if defined(DEBUG)
+ std::cout << "Debug " << std::endl;
+#else
+ std::cout << "Release " << std::endl;
+#endif
+ std::cout << " Platform : " << getPlatformName() << std::endl;
+ std::cout << " CPU : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl;
+ std::cout << " Threads : " << getNumberOfLogicalThreads() << std::endl;
+ std::cout << " ISA : " << stringOfCPUFeatures(cpu_features) << std::endl;
+ std::cout << " Targets : " << supportedTargetList(cpu_features) << std::endl;
+ const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON;
+ const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON;
+ std::cout << " MXCSR : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl;
+ std::cout << " Config" << std::endl;
+ std::cout << " Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl;
+ std::cout << " ISA : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl;
+ std::cout << " Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl;
+ std::cout << " " << getEnabledTargets() << " (compile time enabled)" << std::endl;
+ std::cout << " Features: " << getEmbreeFeatures() << std::endl;
+ std::cout << " Tasking : ";
+#if defined(TASKING_TBB)
+ std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " ";
+ #if TBB_INTERFACE_VERSION >= 12002
+ std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " ";
+ #else
+ std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " ";
+ #endif
+#endif
+#if defined(TASKING_INTERNAL)
+ std::cout << "internal_tasking_system ";
+#endif
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+ std::cout << "GCD tasking system ";
+#endif
+#if defined(TASKING_PPL)
+ std::cout << "PPL ";
+#endif
+ std::cout << std::endl;
+
+ /* check if FTZ and DAZ flags are set in CSR */
+ if (!hasFTZ || !hasDAZ)
+ {
+#if !defined(_DEBUG)
+ if (State::verbosity(1))
+#endif
+ {
+ std::cout << std::endl;
+ std::cout << "================================================================================" << std::endl;
+ std::cout << " WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled " << std::endl
+ << " in the MXCSR control and status register. This can have a severe " << std::endl
+ << " performance impact. Please enable these modes for each application " << std::endl
+ << " thread the following way:" << std::endl
+ << std::endl
+ << " #include \"xmmintrin.h\"" << std::endl
+ << " #include \"pmmintrin.h\"" << std::endl
+ << std::endl
+ << " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl
+ << " _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl;
+ std::cout << "================================================================================" << std::endl;
+ std::cout << std::endl;
+ }
+ }
+ std::cout << std::endl;
+ }
+
+ void Device::setDeviceErrorCode(RTCError error)
+ {
+ RTCError* stored_error = errorHandler.error();
+ if (*stored_error == RTC_ERROR_NONE)
+ *stored_error = error;
+ }
+
+ RTCError Device::getDeviceErrorCode()
+ {
+ RTCError* stored_error = errorHandler.error();
+ RTCError error = *stored_error;
+ *stored_error = RTC_ERROR_NONE;
+ return error;
+ }
+
+ void Device::setThreadErrorCode(RTCError error)
+ {
+ RTCError* stored_error = g_errorHandler.error();
+ if (*stored_error == RTC_ERROR_NONE)
+ *stored_error = error;
+ }
+
+ RTCError Device::getThreadErrorCode()
+ {
+ RTCError* stored_error = g_errorHandler.error();
+ RTCError error = *stored_error;
+ *stored_error = RTC_ERROR_NONE;
+ return error;
+ }
+
+ void Device::process_error(Device* device, RTCError error, const char* str)
+ {
+ /* store global error code when device construction failed */
+ if (!device)
+ return setThreadErrorCode(error);
+
+ /* print error when in verbose mode */
+ if (device->verbosity(1))
+ {
+ switch (error) {
+ case RTC_ERROR_NONE : std::cerr << "Embree: No error"; break;
+ case RTC_ERROR_UNKNOWN : std::cerr << "Embree: Unknown error"; break;
+ case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break;
+ case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break;
+ case RTC_ERROR_OUT_OF_MEMORY : std::cerr << "Embree: Out of memory"; break;
+ case RTC_ERROR_UNSUPPORTED_CPU : std::cerr << "Embree: Unsupported CPU"; break;
+ default : std::cerr << "Embree: Invalid error code"; break;
+ };
+ if (str) std::cerr << ", (" << str << ")";
+ std::cerr << std::endl;
+ }
+
+ /* call user specified error callback */
+ if (device->error_function)
+ device->error_function(device->error_function_userptr,error,str);
+
+ /* record error code */
+ device->setDeviceErrorCode(error);
+ }
+
+ void Device::memoryMonitor(ssize_t bytes, bool post)
+ {
+ if (State::memory_monitor_function && bytes != 0) {
+ if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) {
+ if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor
+ throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination");
+ }
+ }
+ }
+ }
+
+ size_t getMaxNumThreads()
+ {
+ size_t maxNumThreads = 0;
+ for (std::map<Device*,size_t>::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++)
+ maxNumThreads = max(maxNumThreads, (*i).second);
+ if (maxNumThreads == 0)
+ maxNumThreads = std::numeric_limits<size_t>::max();
+ return maxNumThreads;
+ }
+
+ size_t getMaxCacheSize()
+ {
+ size_t maxCacheSize = 0;
+ for (std::map<Device*,size_t>::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++)
+ maxCacheSize = max(maxCacheSize, (*i).second);
+ return maxCacheSize;
+ }
+
+ void Device::setCacheSize(size_t bytes)
+ {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+ Lock<MutexSys> lock(g_mutex);
+ if (bytes == 0) g_cache_size_map.erase(this);
+ else g_cache_size_map[this] = bytes;
+
+ size_t maxCacheSize = getMaxCacheSize();
+ resizeTessellationCache(maxCacheSize);
+#endif
+ }
+
+ void Device::initTaskingSystem(size_t numThreads)
+ {
+ Lock<MutexSys> lock(g_mutex);
+ if (numThreads == 0)
+ g_num_threads_map[this] = std::numeric_limits<size_t>::max();
+ else
+ g_num_threads_map[this] = numThreads;
+
+ /* create task scheduler */
+ size_t maxNumThreads = getMaxNumThreads();
+ TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads);
+#if USE_TASK_ARENA
+ const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount());
+ const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads);
+ arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads));
+#endif
+ }
+
+ void Device::exitTaskingSystem()
+ {
+ Lock<MutexSys> lock(g_mutex);
+ g_num_threads_map.erase(this);
+
+ /* terminate tasking system */
+ if (g_num_threads_map.size() == 0) {
+ TaskScheduler::destroy();
+ }
+ /* or configure new number of threads */
+ else {
+ size_t maxNumThreads = getMaxNumThreads();
+ TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads);
+ }
+#if USE_TASK_ARENA
+ arena.reset();
+#endif
+ }
+
+ void Device::setProperty(const RTCDeviceProperty prop, ssize_t val)
+ {
+ /* hidden internal properties */
+ switch ((size_t)prop)
+ {
+ case 1000000: debug_int0 = val; return;
+ case 1000001: debug_int1 = val; return;
+ case 1000002: debug_int2 = val; return;
+ case 1000003: debug_int3 = val; return;
+ }
+
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property");
+ }
+
+ ssize_t Device::getProperty(const RTCDeviceProperty prop)
+ {
+ size_t iprop = (size_t)prop;
+
+ /* get name of internal regression test */
+ if (iprop >= 2000000 && iprop < 3000000)
+ {
+ RegressionTest* test = getRegressionTest(iprop-2000000);
+ if (test) return (ssize_t) test->name.c_str();
+ else return 0;
+ }
+
+ /* run internal regression test */
+ if (iprop >= 3000000 && iprop < 4000000)
+ {
+ RegressionTest* test = getRegressionTest(iprop-3000000);
+ if (test) return test->run();
+ else return 0;
+ }
+
+ /* documented properties */
+ switch (prop)
+ {
+ case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR;
+ case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR;
+ case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH;
+ case RTC_DEVICE_PROPERTY_VERSION : return RTC_VERSION;
+
+#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS)
+ case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return hasISA(SSE2);
+#else
+ case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS)
+ case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return hasISA(AVX);
+#else
+ case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS)
+ case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX);
+#else
+ case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_RAY_PACKETS)
+ case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_RAY_MASK)
+ case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING)
+ case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING_CURVES)
+ case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_COMPACT_POLYS)
+ case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+ case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0;
+#endif
+
+#if defined(TASKING_INTERNAL)
+ case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0;
+#endif
+
+#if defined(TASKING_TBB)
+ case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1;
+#endif
+
+#if defined(TASKING_PPL)
+ case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2;
+#endif
+
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+ case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3;
+#endif
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+ case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+ case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+ case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+ case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(TASKING_PPL)
+ case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+ case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#else
+ case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1;
+#endif
+
+#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+ case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1;
+#else
+ case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0;
+#endif
+
+ default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break;
+ };
+ }
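+
+  /* These properties are queried through the public entry point, e.g.
+     (a sketch using properties handled above):
+
+       ssize_t version = rtcGetDeviceProperty(device, RTC_DEVICE_PROPERTY_VERSION);
+       bool ray8 = rtcGetDeviceProperty(device, RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED) != 0;
+  */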
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.h b/thirdparty/embree-aarch64/kernels/common/device.h
new file mode 100644
index 0000000000..e9a81bb109
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "state.h"
+#include "accel.h"
+
+namespace embree
+{
+ class BVH4Factory;
+ class BVH8Factory;
+
+ class Device : public State, public MemoryMonitorInterface
+ {
+ ALIGNED_CLASS_(16);
+
+ public:
+
+ /*! Device construction */
+ Device (const char* cfg);
+
+ /*! Device destruction */
+ virtual ~Device ();
+
+ /*! prints info about the device */
+ void print();
+
+ /*! sets the error code */
+ void setDeviceErrorCode(RTCError error);
+
+ /*! returns and clears the error code */
+ RTCError getDeviceErrorCode();
+
+ /*! sets the error code */
+ static void setThreadErrorCode(RTCError error);
+
+ /*! returns and clears the error code */
+ static RTCError getThreadErrorCode();
+
+ /*! processes error codes, do not call directly */
+ static void process_error(Device* device, RTCError error, const char* str);
+
+ /*! invokes the memory monitor callback */
+ void memoryMonitor(ssize_t bytes, bool post);
+
+ /*! sets the size of the software cache. */
+ void setCacheSize(size_t bytes);
+
+ /*! sets a property */
+ void setProperty(const RTCDeviceProperty prop, ssize_t val);
+
+ /*! gets a property */
+ ssize_t getProperty(const RTCDeviceProperty prop);
+
+ private:
+
+ /*! initializes the tasking system */
+ void initTaskingSystem(size_t numThreads);
+
+ /*! shuts down the tasking system */
+ void exitTaskingSystem();
+
+ /*! some variables that can be set via rtcSetDeviceProperty for debugging purposes */
+ public:
+ static ssize_t debug_int0;
+ static ssize_t debug_int1;
+ static ssize_t debug_int2;
+ static ssize_t debug_int3;
+
+ public:
+ std::unique_ptr<BVH4Factory> bvh4_factory;
+#if defined(EMBREE_TARGET_SIMD8)
+ std::unique_ptr<BVH8Factory> bvh8_factory;
+#endif
+
+#if USE_TASK_ARENA
+ std::unique_ptr<tbb::task_arena> arena;
+#endif
+
+ /* ray streams filter */
+ RayStreamFilterFuncs rayStreamFilters;
+ };
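+
+  /* Typical lifecycle through the public API (a sketch; the configuration
+     string is parsed by the State base class):
+
+       RTCDevice device = rtcNewDevice("threads=4,verbose=1");
+       if (rtcGetDeviceError(nullptr) != RTC_ERROR_NONE) {
+         // construction failed; the error is stored per-thread (see device.cpp)
+       }
+       // ... create scenes and geometries ...
+       rtcReleaseDevice(device);
+  */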
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
new file mode 100644
index 0000000000..b3aa8e3396
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
@@ -0,0 +1,259 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "geometry.h"
+#include "scene.h"
+
+namespace embree
+{
+ const char* Geometry::gtype_names[Geometry::GTY_END] =
+ {
+ "flat_linear_curve",
+ "round_linear_curve",
+ "oriented_linear_curve",
+ "",
+ "flat_bezier_curve",
+ "round_bezier_curve",
+ "oriented_bezier_curve",
+ "",
+ "flat_bspline_curve",
+ "round_bspline_curve",
+ "oriented_bspline_curve",
+ "",
+ "flat_hermite_curve",
+ "round_hermite_curve",
+ "oriented_hermite_curve",
+ "",
+ "flat_catmull_rom_curve",
+ "round_catmull_rom_curve",
+ "oriented_catmull_rom_curve",
+ "",
+ "triangles",
+ "quads",
+ "grid",
+ "subdivs",
+ "",
+ "sphere",
+ "disc",
+ "oriented_disc",
+ "",
+ "usergeom",
+ "instance_cheap",
+ "instance_expensive",
+ };
+
+ Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps)
+ : device(device), userPtr(nullptr),
+ numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f),
+ mask(-1),
+ gtype(gtype),
+ gsubtype(GTY_SUBTYPE_DEFAULT),
+ quality(RTC_BUILD_QUALITY_MEDIUM),
+ state((unsigned)State::MODIFIED),
+ enabled(true),
+ intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr)
+ {
+ device->refInc();
+ }
+
+ Geometry::~Geometry()
+ {
+ device->refDec();
+ }
+
+ void Geometry::setNumPrimitives(unsigned int numPrimitives_in)
+ {
+ if (numPrimitives_in == numPrimitives) return;
+
+ numPrimitives = numPrimitives_in;
+
+ Geometry::update();
+ }
+
+ void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in)
+ {
+ if (numTimeSteps_in == numTimeSteps) {
+ return;
+ }
+
+ numTimeSteps = numTimeSteps_in;
+ fnumTimeSegments = float(numTimeSteps_in-1);
+
+ Geometry::update();
+ }
+
+ void Geometry::setTimeRange (const BBox1f range)
+ {
+ time_range = range;
+ Geometry::update();
+ }
+
+ void Geometry::update()
+ {
+ ++modCounter_; // FIXME: required?
+ state = (unsigned)State::MODIFIED;
+ }
+
+ void Geometry::commit()
+ {
+ ++modCounter_;
+ state = (unsigned)State::COMMITTED;
+ }
+
+ void Geometry::preCommit()
+ {
+ if (State::MODIFIED == (State)state)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed");
+ }
+
+ void Geometry::postCommit()
+ {
+ }
+
+ void Geometry::enable ()
+ {
+ if (isEnabled())
+ return;
+
+ enabled = true;
+ ++modCounter_;
+ }
+
+ void Geometry::disable ()
+ {
+ if (isDisabled())
+ return;
+
+ enabled = false;
+ ++modCounter_;
+ }
+
+ void Geometry::setUserData (void* ptr)
+ {
+ userPtr = ptr;
+ }
+
+ void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter)
+ {
+ if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH)))
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry");
+
+ intersectionFilterN = filter;
+ }
+
+ void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter)
+ {
+ if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH)))
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry");
+
+ occlusionFilterN = filter;
+ }
+
+ void Geometry::setPointQueryFunction (RTCPointQueryFunction func)
+ {
+ pointQueryFunc = func;
+ }
+
+ void Geometry::interpolateN(const RTCInterpolateNArguments* const args)
+ {
+ const void* valid_i = args->valid;
+ const unsigned* primIDs = args->primIDs;
+ const float* u = args->u;
+ const float* v = args->v;
+ unsigned int N = args->N;
+ RTCBufferType bufferType = args->bufferType;
+ unsigned int bufferSlot = args->bufferSlot;
+ float* P = args->P;
+ float* dPdu = args->dPdu;
+ float* dPdv = args->dPdv;
+ float* ddPdudu = args->ddPdudu;
+ float* ddPdvdv = args->ddPdvdv;
+ float* ddPdudv = args->ddPdudv;
+ unsigned int valueCount = args->valueCount;
+
+ if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex");
+ const int* valid = (const int*) valid_i;
+
+ __aligned(64) float P_tmp[256];
+ __aligned(64) float dPdu_tmp[256];
+ __aligned(64) float dPdv_tmp[256];
+ __aligned(64) float ddPdudu_tmp[256];
+ __aligned(64) float ddPdvdv_tmp[256];
+ __aligned(64) float ddPdudv_tmp[256];
+
+ float* Pt = P ? P_tmp : nullptr;
+ float* dPdut = nullptr, *dPdvt = nullptr;
+ if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; }
+ float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr;
+ if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; }
+
+ for (unsigned int i=0; i<N; i++)
+ {
+ if (valid && !valid[i]) continue;
+
+ RTCInterpolateArguments iargs;
+ iargs.primID = primIDs[i];
+ iargs.u = u[i];
+ iargs.v = v[i];
+ iargs.bufferType = bufferType;
+ iargs.bufferSlot = bufferSlot;
+ iargs.P = Pt;
+ iargs.dPdu = dPdut;
+ iargs.dPdv = dPdvt;
+ iargs.ddPdudu = ddPdudut;
+ iargs.ddPdvdv = ddPdvdvt;
+ iargs.ddPdudv = ddPdudvt;
+ iargs.valueCount = valueCount;
+ interpolate(&iargs);
+
+ if (likely(P)) {
+ for (unsigned int j=0; j<valueCount; j++)
+ P[j*N+i] = Pt[j];
+ }
+ if (likely(dPdu))
+ {
+ for (unsigned int j=0; j<valueCount; j++) {
+ dPdu[j*N+i] = dPdut[j];
+ dPdv[j*N+i] = dPdvt[j];
+ }
+ }
+ if (likely(ddPdudu))
+ {
+ for (unsigned int j=0; j<valueCount; j++) {
+ ddPdudu[j*N+i] = ddPdudut[j];
+ ddPdvdv[j*N+i] = ddPdvdvt[j];
+ ddPdudv[j*N+i] = ddPdudvt[j];
+ }
+ }
+ }
+ }
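+
+  /* interpolateN() writes results in struct-of-arrays layout (value j of
+     query i lands at P[j*N+i]). The single-query public entry point is the
+     simpler rtcInterpolate1 (a sketch; buffer slot 0 is assumed):
+
+       float P[3], dPdu[3], dPdv[3];
+       rtcInterpolate1(geom, primID, u, v, RTC_BUFFER_TYPE_VERTEX, 0,
+                       P, dPdu, dPdv, 3);
+  */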
+
+ bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context)
+ {
+ assert(context->primID < size());
+
+ RTCPointQueryFunctionArguments args;
+ args.query = (RTCPointQuery*)context->query_ws;
+ args.userPtr = context->userPtr;
+ args.primID = context->primID;
+ args.geomID = context->geomID;
+ args.context = context->userContext;
+ args.similarityScale = context->similarityScale;
+
+ bool update = false;
+ if(context->func) update |= context->func(&args);
+ if(pointQueryFunc) update |= pointQueryFunc(&args);
+
+ if (update && context->userContext->instStackSize > 0)
+ {
+ // update point query
+ if (context->query_type == POINT_QUERY_TYPE_AABB) {
+ context->updateAABB();
+ } else {
+ assert(context->similarityScale > 0.f);
+ query->radius = context->query_ws->radius * context->similarityScale;
+ }
+ }
+ return update;
+ }
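+
+  /* Both callbacks above are optional: context->func is passed to
+     rtcPointQuery() itself, pointQueryFunc is registered per geometry. A
+     sketch with the public Embree 3 API (coordinates and the callback name
+     are placeholders):
+
+       rtcSetGeometryPointQueryFunction(geom, myPointQueryFunc);
+       RTCPointQuery query;
+       query.x = px; query.y = py; query.z = pz;
+       query.radius = r; query.time = 0.f;
+       RTCPointQueryContext ctx;
+       rtcInitPointQueryContext(&ctx);
+       rtcPointQuery(scene, &query, &ctx, nullptr, userPtr);
+  */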
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h
new file mode 100644
index 0000000000..953974bfd2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.h
@@ -0,0 +1,582 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "buffer.h"
+#include "../common/point_query.h"
+#include "../builders/priminfo.h"
+
+namespace embree
+{
+ class Scene;
+ class Geometry;
+
+ struct GeometryCounts
+ {
+ __forceinline GeometryCounts()
+ : numFilterFunctions(0),
+ numTriangles(0), numMBTriangles(0),
+ numQuads(0), numMBQuads(0),
+ numBezierCurves(0), numMBBezierCurves(0),
+ numLineSegments(0), numMBLineSegments(0),
+ numSubdivPatches(0), numMBSubdivPatches(0),
+ numUserGeometries(0), numMBUserGeometries(0),
+ numInstancesCheap(0), numMBInstancesCheap(0),
+ numInstancesExpensive(0), numMBInstancesExpensive(0),
+ numGrids(0), numMBGrids(0),
+ numPoints(0), numMBPoints(0) {}
+
+ __forceinline size_t size() const {
+ return numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints
+ + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints;
+ }
+
+ __forceinline unsigned int enabledGeometryTypesMask() const
+ {
+ unsigned int mask = 0;
+ if (numTriangles) mask |= 1 << 0;
+ if (numQuads) mask |= 1 << 1;
+ if (numBezierCurves+numLineSegments) mask |= 1 << 2;
+ if (numSubdivPatches) mask |= 1 << 3;
+ if (numUserGeometries) mask |= 1 << 4;
+ if (numInstancesCheap) mask |= 1 << 5;
+ if (numInstancesExpensive) mask |= 1 << 6;
+ if (numGrids) mask |= 1 << 7;
+ if (numPoints) mask |= 1 << 8;
+
+ unsigned int maskMB = 0;
+ if (numMBTriangles) maskMB |= 1 << 0;
+ if (numMBQuads) maskMB |= 1 << 1;
+ if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2;
+ if (numMBSubdivPatches) maskMB |= 1 << 3;
+ if (numMBUserGeometries) maskMB |= 1 << 4;
+ if (numMBInstancesCheap) maskMB |= 1 << 5;
+ if (numMBInstancesExpensive) maskMB |= 1 << 6;
+ if (numMBGrids) maskMB |= 1 << 7;
+ if (numMBPoints) maskMB |= 1 << 8;
+
+ return (mask<<8) + maskMB;
+ }
+
+ __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const
+ {
+ GeometryCounts ret;
+ ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions;
+ ret.numTriangles = numTriangles + rhs.numTriangles;
+ ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles;
+ ret.numQuads = numQuads + rhs.numQuads;
+ ret.numMBQuads = numMBQuads + rhs.numMBQuads;
+ ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves;
+ ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves;
+ ret.numLineSegments = numLineSegments + rhs.numLineSegments;
+ ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments;
+ ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches;
+ ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches;
+ ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries;
+ ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries;
+ ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap;
+ ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap;
+ ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive;
+ ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive;
+ ret.numGrids = numGrids + rhs.numGrids;
+ ret.numMBGrids = numMBGrids + rhs.numMBGrids;
+ ret.numPoints = numPoints + rhs.numPoints;
+ ret.numMBPoints = numMBPoints + rhs.numMBPoints;
+
+ return ret;
+ }
+
+ size_t numFilterFunctions; //!< number of geometries with filter functions enabled
+ size_t numTriangles; //!< number of enabled triangles
+ size_t numMBTriangles; //!< number of enabled motion blurred triangles
+ size_t numQuads; //!< number of enabled quads
+ size_t numMBQuads; //!< number of enabled motion blurred quads
+ size_t numBezierCurves; //!< number of enabled curves
+ size_t numMBBezierCurves; //!< number of enabled motion blurred curves
+ size_t numLineSegments; //!< number of enabled line segments
+ size_t numMBLineSegments; //!< number of enabled motion blurred line segments
+ size_t numSubdivPatches; //!< number of enabled subdivision patches
+ size_t numMBSubdivPatches; //!< number of enabled motion blurred subdivision patches
+ size_t numUserGeometries; //!< number of enabled user geometries
+ size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries
+ size_t numInstancesCheap; //!< number of enabled cheap instances
+ size_t numMBInstancesCheap; //!< number of enabled motion blurred cheap instances
+ size_t numInstancesExpensive; //!< number of enabled expensive instances
+ size_t numMBInstancesExpensive; //!< number of enabled motion blurred expensive instances
+ size_t numGrids; //!< number of enabled grid geometries
+ size_t numMBGrids; //!< number of enabled motion blurred grid geometries
+ size_t numPoints; //!< number of enabled points
+ size_t numMBPoints; //!< number of enabled motion blurred points
+ };
+
+ /*! Base class all geometries are derived from */
+ class Geometry : public RefCount
+ {
+ friend class Scene;
+ public:
+
+ /*! type of geometry */
+ enum GType
+ {
+ GTY_FLAT_LINEAR_CURVE = 0,
+ GTY_ROUND_LINEAR_CURVE = 1,
+ GTY_ORIENTED_LINEAR_CURVE = 2,
+ GTY_CONE_LINEAR_CURVE = 3,
+
+ GTY_FLAT_BEZIER_CURVE = 4,
+ GTY_ROUND_BEZIER_CURVE = 5,
+ GTY_ORIENTED_BEZIER_CURVE = 6,
+
+ GTY_FLAT_BSPLINE_CURVE = 8,
+ GTY_ROUND_BSPLINE_CURVE = 9,
+ GTY_ORIENTED_BSPLINE_CURVE = 10,
+
+ GTY_FLAT_HERMITE_CURVE = 12,
+ GTY_ROUND_HERMITE_CURVE = 13,
+ GTY_ORIENTED_HERMITE_CURVE = 14,
+
+ GTY_FLAT_CATMULL_ROM_CURVE = 16,
+ GTY_ROUND_CATMULL_ROM_CURVE = 17,
+ GTY_ORIENTED_CATMULL_ROM_CURVE = 18,
+
+ GTY_TRIANGLE_MESH = 20,
+ GTY_QUAD_MESH = 21,
+ GTY_GRID_MESH = 22,
+ GTY_SUBDIV_MESH = 23,
+
+ GTY_SPHERE_POINT = 25,
+ GTY_DISC_POINT = 26,
+ GTY_ORIENTED_DISC_POINT = 27,
+
+ GTY_USER_GEOMETRY = 29,
+ GTY_INSTANCE_CHEAP = 30,
+ GTY_INSTANCE_EXPENSIVE = 31,
+ GTY_END = 32,
+
+ GTY_BASIS_LINEAR = 0,
+ GTY_BASIS_BEZIER = 4,
+ GTY_BASIS_BSPLINE = 8,
+ GTY_BASIS_HERMITE = 12,
+ GTY_BASIS_CATMULL_ROM = 16,
+ GTY_BASIS_MASK = 28,
+
+ GTY_SUBTYPE_FLAT_CURVE = 0,
+ GTY_SUBTYPE_ROUND_CURVE = 1,
+ GTY_SUBTYPE_ORIENTED_CURVE = 2,
+ GTY_SUBTYPE_MASK = 3,
+ };
+
+ enum GSubType
+ {
+ GTY_SUBTYPE_DEFAULT = 0,
+ GTY_SUBTYPE_INSTANCE_LINEAR = 0,
+ GTY_SUBTYPE_INSTANCE_QUATERNION = 1
+ };
+
+ enum GTypeMask
+ {
+ MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE,
+ MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE,
+ MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE,
+ MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE,
+
+ MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE,
+ MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE,
+ MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE,
+
+ MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE,
+ MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE,
+ MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE,
+
+ MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE,
+ MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE,
+ MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE,
+
+ MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE,
+ MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE,
+ MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE,
+
+ MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE,
+
+ MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE |
+ MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE |
+ MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE |
+ MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE,
+
+ MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT,
+ MTY_DISC_POINT = 1ul << GTY_DISC_POINT,
+ MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT,
+
+ MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT,
+
+ MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS,
+
+ MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH,
+ MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH,
+ MTY_GRID_MESH = 1ul << GTY_GRID_MESH,
+ MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH,
+ MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY,
+
+ MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP,
+ MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE,
+ MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE
+ };
+
+ static const char* gtype_names[GTY_END];
+
+ enum class State : unsigned {
+ MODIFIED = 0,
+ COMMITTED = 1,
+ };
+
+ public:
+
+ /*! Geometry constructor */
+ Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps);
+
+ /*! Geometry destructor */
+ virtual ~Geometry();
+
+ public:
+
+ /*! tests if geometry is enabled */
+ __forceinline bool isEnabled() const { return enabled; }
+
+ /*! tests if geometry is disabled */
+ __forceinline bool isDisabled() const { return !isEnabled(); }
+
+ /*! tests if the geometry has any filter function set */
+ __forceinline bool hasFilterFunctions () const {
+ return (intersectionFilterN != nullptr) || (occlusionFilterN != nullptr);
+ }
+
+ /*! returns geometry type */
+ __forceinline GType getType() const { return gtype; }
+
+ /*! returns curve type */
+ __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); }
+
+ /*! returns curve basis */
+ __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); }
+
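+ /* For curve geometries the GType value packs subtype and basis into
+    disjoint bit fields, e.g. GTY_ROUND_BSPLINE_CURVE (9) decomposes as
+    (9 & GTY_BASIS_MASK) == GTY_BASIS_BSPLINE (8) and
+    (9 & GTY_SUBTYPE_MASK) == GTY_SUBTYPE_ROUND_CURVE (1). */
+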
+ /*! returns geometry type mask */
+ __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); }
+
+ /*! returns number of primitives */
+ __forceinline size_t size() const { return numPrimitives; }
+
+ /*! sets the number of primitives */
+ virtual void setNumPrimitives(unsigned int numPrimitives_in);
+
+ /*! sets number of time steps */
+ virtual void setNumTimeSteps (unsigned int numTimeSteps_in);
+
+ /*! sets motion blur time range */
+ void setTimeRange (const BBox1f range);
+
+ /*! sets number of vertex attributes */
+ virtual void setVertexAttributeCount (unsigned int N) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! sets number of topologies */
+ virtual void setTopologyCount (unsigned int N) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! sets the build quality */
+ void setBuildQuality(RTCBuildQuality quality_in)
+ {
+ this->quality = quality_in;
+ Geometry::update();
+ }
+
+ /* calculate time segment itime and fractional time ftime */
+ __forceinline int timeSegment(float time, float& ftime) const {
+ return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime);
+ }
+
+ template<int N>
+ __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const {
+ return getTimeSegment(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime);
+ }
+
+ /* calculate overlapping time segment range */
+ __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+ return getTimeSegmentRange(range,time_range,fnumTimeSegments);
+ }
+
+ /* returns time that corresponds to time step */
+ __forceinline float timeStep(const int i) const {
+ assert(i>=0 && i<(int)numTimeSteps);
+ return time_range.lower + time_range.size()*float(i)/fnumTimeSegments;
+ }
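+
+ /* Roughly, timeSegment() maps 'time' into the motion blur time range,
+    scales by the segment count, and splits into integer and fractional
+    parts: with numTimeSteps=3 and time_range=[0,1], time=0.75 gives
+    0.75*2 = 1.5, i.e. itime=1 and ftime=0.5. */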
+
+ /*! for all geometries */
+ public:
+
+ /*! Enable geometry. */
+ virtual void enable();
+
+ /*! Update geometry. */
+ void update();
+
+ /*! commit of geometry */
+ virtual void commit();
+
+ /*! Update geometry buffer. */
+ virtual void updateBuffer(RTCBufferType type, unsigned int slot) {
+ update(); // update everything for geometries not supporting this call
+ }
+
+ /*! Disable geometry. */
+ virtual void disable();
+
+ /*! Verify the geometry */
+ virtual bool verify() { return true; }
+
+ /*! called before every build */
+ virtual void preCommit();
+
+ /*! called after every build */
+ virtual void postCommit();
+
+ virtual void addElementsToCount (GeometryCounts & counts) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! sets constant tessellation rate for the geometry */
+ virtual void setTessellationRate(float N) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Sets the maximal curve radius scale allowed by min-width feature. */
+ virtual void setMaxRadiusScale(float s) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Set user data pointer. */
+ virtual void setUserData(void* ptr);
+
+ /*! Get user data pointer. */
+ __forceinline void* getUserData() const {
+ return userPtr;
+ }
+
+ /*! interpolates user data to the specified u/v location */
+ virtual void interpolate(const RTCInterpolateArguments* const args) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! interpolates user data to the specified u/v locations */
+ virtual void interpolateN(const RTCInterpolateNArguments* const args);
+
+ /* point query api */
+ bool pointQuery(PointQuery* query, PointQueryContext* context);
+
+ /*! for subdivision surfaces only */
+ public:
+ virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Set displacement function. */
+ virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ virtual unsigned int getFirstHalfEdge(unsigned int faceID) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ virtual unsigned int getFace(unsigned int edgeID) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ virtual unsigned int getNextHalfEdge(unsigned int edgeID) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! get fast access to first vertex buffer if applicable */
+ virtual float * getCompactVertexArray () const {
+ return nullptr;
+ }
+
+ /*! Returns the modification counter - how many times the geometry has been modified */
+ __forceinline unsigned int getModCounter () const {
+ return modCounter_;
+ }
+
+ /*! for triangle meshes and bezier curves only */
+ public:
+
+
+ /*! Sets ray mask. */
+ virtual void setMask(unsigned mask) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Sets specified buffer. */
+ virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Gets specified buffer. */
+ virtual void* getBuffer(RTCBufferType type, unsigned int slot) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Set intersection filter function for ray packets of size N. */
+ virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN);
+
+ /*! Set occlusion filter function for ray packets of size N. */
+ virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN);
+
+ /*! for instances only */
+ public:
+
+ /*! Sets the instanced scene */
+ virtual void setInstancedScene(const Ref<Scene>& scene) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Sets transformation of the instance */
+ virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Sets transformation of the instance */
+ virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Returns the transformation of the instance */
+ virtual AffineSpace3fa getTransform(float time) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! for user geometries only */
+ public:
+
+ /*! Set bounds function. */
+ virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Set intersect function for ray packets of size N. */
+ virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Set occlusion function for ray packets of size N. */
+ virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+ }
+
+ /*! Set point query function. */
+ void setPointQueryFunction(RTCPointQueryFunction func);
+
+ /*! returns number of time segments */
+ __forceinline unsigned numTimeSegments () const {
+ return numTimeSteps-1;
+ }
+
+ public:
+
+ virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry");
+ }
+
+ virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry");
+ }
+
+ virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry");
+ }
+
+ virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry");
+ }
+
+ virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry");
+ }
+
+ virtual Vec3fa computeDirection(unsigned int primID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry");
+ }
+
+ virtual Vec3fa computeDirection(unsigned int primID, size_t time) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry");
+ }
+
+ virtual BBox3fa vbounds(size_t primID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry");
+ }
+
+ virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry");
+ }
+
+ virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry");
+ }
+
+ virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry");
+ }
+
+ virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry");
+ }
+
+ virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry");
+ }
+
+ public:
+ __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; }
+ __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; }
+
+ public:
+ Device* device; //!< device this geometry belongs to
+
+ void* userPtr; //!< user pointer
+ unsigned int numPrimitives; //!< number of primitives of this geometry
+
+ unsigned int numTimeSteps; //!< number of time steps
+ float fnumTimeSegments; //!< number of time segments (precalculation)
+ BBox1f time_range; //!< motion blur time range
+
+ unsigned int mask; //!< for masking out geometry
+ unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when the geometry is modified
+
+ struct {
+ GType gtype : 8; //!< geometry type
+ GSubType gsubtype : 8; //!< geometry subtype
+ RTCBuildQuality quality : 3; //!< build quality for geometry
+ unsigned state : 2;
+ bool enabled : 1; //!< true if geometry is enabled
+ };
+
+ RTCFilterFunctionN intersectionFilterN;
+ RTCFilterFunctionN occlusionFilterN;
+ RTCPointQueryFunction pointQueryFunc;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h
new file mode 100644
index 0000000000..32a198cdfe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/hit.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "instance_stack.h"
+
+namespace embree
+{
+ /* Hit structure for K hits */
+ template<int K>
+ struct HitK
+ {
+ /* Default construction does nothing */
+ __forceinline HitK() {}
+
+ /* Constructs a hit */
+ __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
+ : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID)
+ {
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ instID[l] = RTC_INVALID_GEOMETRY_ID;
+ instance_id_stack::copy(context->instID, instID);
+ }
+
+ /* Returns the size of the hit */
+ static __forceinline size_t size() { return K; }
+
+ public:
+ Vec3vf<K> Ng; // geometry normal
+ vfloat<K> u; // barycentric u coordinate of hit
+ vfloat<K> v; // barycentric v coordinate of hit
+ vuint<K> primID; // primitive ID
+ vuint<K> geomID; // geometry ID
+ vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+ };
+
+ /* Specialization for a single hit */
+ template<>
+ struct __aligned(16) HitK<1>
+ {
+ /* Default construction does nothing */
+ __forceinline HitK() {}
+
+ /* Constructs a hit */
+ __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
+ : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID)
+ {
+ instance_id_stack::copy(context->instID, instID);
+ }
+
+ /* Returns the size of the hit */
+ static __forceinline size_t size() { return 1; }
+
+ public:
+ Vec3<float> Ng; // geometry normal
+ float u; // barycentric u coordinate of hit
+ float v; // barycentric v coordinate of hit
+ unsigned int primID; // primitive ID
+ unsigned int geomID; // geometry ID
+ unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+ };
+
+ /* Shortcuts */
+ typedef HitK<1> Hit;
+ typedef HitK<4> Hit4;
+ typedef HitK<8> Hit8;
+ typedef HitK<16> Hit16;
+
+ /* Outputs hit to stream */
+ template<int K>
+ __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& ray)
+ {
+ cout << "{ " << embree_endl
+ << " Ng = " << ray.Ng << embree_endl
+ << " u = " << ray.u << embree_endl
+ << " v = " << ray.v << embree_endl
+ << " primID = " << ray.primID << embree_endl
+ << " geomID = " << ray.geomID << embree_endl
+ << " instID =";
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ {
+ cout << " " << ray.instID[l];
+ }
+ cout << embree_endl;
+ return cout << "}";
+ }
+
+ template<typename Hit>
+ __forceinline void copyHitToRay(RayHit& ray, const Hit& hit)
+ {
+ ray.Ng = hit.Ng;
+ ray.u = hit.u;
+ ray.v = hit.v;
+ ray.primID = hit.primID;
+ ray.geomID = hit.geomID;
+ instance_id_stack::copy(hit.instID, ray.instID);
+ }
+
+ template<int K>
+ __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit)
+ {
+ vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x);
+ vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y);
+ vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z);
+ vfloat<K>::storeu(mask,&ray.u, hit.u);
+ vfloat<K>::storeu(mask,&ray.v, hit.v);
+ vuint<K>::storeu(mask,&ray.primID, hit.primID);
+ vuint<K>::storeu(mask,&ray.geomID, hit.geomID);
+ instance_id_stack::copy(hit.instID, ray.instID, mask);
+ }
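+
+  /* On the public API side the same hit fields live in RTCRayHit; a minimal
+     readout sketch after tracing one ray (standard Embree 3 usage, rayhit
+     assumed initialized):
+
+       RTCIntersectContext ctx;
+       rtcInitIntersectContext(&ctx);
+       rtcIntersect1(scene, &ctx, &rayhit);
+       if (rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID) {
+         // rayhit.hit.u/v, Ng_x/y/z, primID, geomID and instID[] are valid
+       }
+  */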
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
new file mode 100644
index 0000000000..d7e3637f7b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
@@ -0,0 +1,199 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore.h"
+
+namespace embree {
+namespace instance_id_stack {
+
+static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0,
+ "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0.");
+
+/*******************************************************************************
+ * Instance ID stack manipulation.
+ * This is used from the instance intersector.
+ ******************************************************************************/
+
+/*
+ * Push an instance to the stack.
+ */
+RTC_FORCEINLINE bool push(RTCIntersectContext* context,
+ unsigned instanceId)
+{
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+ const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT;
+ /* We assert here because instances are silently dropped when the stack is full.
+ This might be quite hard to find in production. */
+ assert(spaceAvailable);
+ if (likely(spaceAvailable))
+ context->instID[context->instStackSize++] = instanceId;
+ return spaceAvailable;
+#else
+ const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID);
+ assert(spaceAvailable);
+ if (likely(spaceAvailable))
+ context->instID[0] = instanceId;
+ return spaceAvailable;
+#endif
+}
+
+
+/*
+ * Pop the last instance pushed to the stack.
+ * Do not call on an empty stack.
+ */
+RTC_FORCEINLINE void pop(RTCIntersectContext* context)
+{
+ assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+ assert(context->instStackSize > 0);
+ context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+#else
+ assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID);
+ context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+#endif
+}
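+
+/*
+ * A sketch of how the instance intersector brackets nested traversal with
+ * push/pop (the instance ID and traversal call are placeholders):
+ *
+ *   if (push(context, instanceId)) {
+ *     intersectInstancedScene(context, ray);  // hits record the current instID stack
+ *     pop(context);
+ *   }
+ */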
+
+/*******************************************************************************
+ * Optimized instance id stack copy.
+ * The copy() function at the bottom of this block will either copy full
+ * stacks or copy only until the last valid element has been copied, depending
+ * on RTC_MAX_INSTANCE_LEVEL_COUNT.
+ ******************************************************************************/
+
+/*
+ * Plain array assignment. This works for scalar->scalar,
+ * scalar->vector, and vector->vector.
+ */
+template <class Src, class Tgt>
+RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt)
+{
+ tgt[level] = src[level];
+}
+
+/*
+ * Masked SIMD vector->vector store.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+ vuint<K>::storeu(mask, tgt + level, src[level]);
+}
+
+/*
+ * Masked scalar->SIMD vector store.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+ vuint<K>::store(mask, tgt + level, src[level]);
+}
+
+/*
+ * Indexed assign from vector to scalar.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, unsigned* tgt, const size_t& idx)
+{
+ tgt[level] = src[level][idx];
+}
+
+/*
+ * Indexed assign from scalar to vector.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const size_t& idx)
+{
+ tgt[level][idx] = src[level];
+}
+
+/*
+ * Indexed assign from vector to vector.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const size_t& i, const size_t& j)
+{
+ tgt[level][j] = src[level][i];
+}
+
+/*
+ * Check if the given stack level is valid.
+ * These are only used for large max stack sizes.
+ */
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack)
+{
+ return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/)
+{
+ return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool<K>& /*mask*/)
+{
+ return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack)
+{
+ return any(stack[level] != RTC_INVALID_GEOMETRY_ID);
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const vbool<K>& mask)
+{
+ return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID));
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i)
+{
+ return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i, const size_t& /*j*/)
+{
+ return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+
+/*
+ * Copy an instance ID stack.
+ *
+ * This function automatically selects the matching level_copy overload
+ * defined above.
+ */
+template <class Src, class Tgt, class... Args>
+RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+ /*
+ * Avoid all loops for only one level.
+ */
+ level_copy(0, src, tgt, std::forward<Args>(args)...);
+
+#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4)
+ /*
+ * It is faster to avoid the valid test for low level counts.
+ * Just copy the whole stack.
+ */
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ level_copy(l, src, tgt, std::forward<Args>(args)...);
+
+#else
+ /*
+ * For general stack sizes, it pays off to test for validity.
+ */
+ bool valid = true;
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l)
+ {
+ level_copy(l, src, tgt, std::forward<Args>(args)...);
+ valid = level_valid(l, src, std::forward<Args>(args)...);
+ }
+#endif
+}
+
+} // namespace instance_id_stack
+} // namespace embree
+
diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h
new file mode 100644
index 0000000000..63fb8d3351
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/isa.h
@@ -0,0 +1,271 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+
+namespace embree
+{
+#define DEFINE_SYMBOL2(type,name) \
+ typedef type (*name##Func)(); \
+ name##Func name;
+
+#define DECLARE_SYMBOL2(type,name) \
+ namespace sse2 { extern type name(); } \
+ namespace sse42 { extern type name(); } \
+ namespace avx { extern type name(); } \
+ namespace avx2 { extern type name(); } \
+ namespace avx512knl { extern type name(); } \
+ namespace avx512skx { extern type name(); } \
+ void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \
+ type name##_error() { return type(name##_error2); } \
+ type name##_zero() { return type(nullptr); }
+
+#define DECLARE_ISA_FUNCTION(type,symbol,args) \
+ namespace sse2 { extern type symbol(args); } \
+ namespace sse42 { extern type symbol(args); } \
+ namespace avx { extern type symbol(args); } \
+ namespace avx2 { extern type symbol(args); } \
+ namespace avx512knl { extern type symbol(args); } \
+ namespace avx512skx { extern type symbol(args); } \
+ inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \
+ typedef type (*symbol##Ty)(args); \
+
+#define DEFINE_ISA_FUNCTION(type,symbol,args) \
+ typedef type (*symbol##Func)(args); \
+ symbol##Func symbol;
+
+#define ZERO_SYMBOL(features,intersector) \
+ intersector = intersector##_zero;
+
+#define INIT_SYMBOL(features,intersector) \
+ intersector = decltype(intersector)(intersector##_error);
+
+#define SELECT_SYMBOL_DEFAULT(features,intersector) \
+ intersector = isa::intersector;
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#if !defined(EMBREE_TARGET_SIMD4)
+#define EMBREE_TARGET_SIMD4
+#endif
+#endif
+
+#if defined(EMBREE_TARGET_SSE42)
+#define SELECT_SYMBOL_SSE42(features,intersector) \
+ if ((features & SSE42) == SSE42) intersector = sse42::intersector;
+#else
+#define SELECT_SYMBOL_SSE42(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX) || defined(__AVX__)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target
+#define SELECT_SYMBOL_AVX(features,intersector) \
+ if ((features & ISA) == ISA) intersector = isa::intersector;
+#else
+#define SELECT_SYMBOL_AVX(features,intersector) \
+ if ((features & AVX) == AVX) intersector = avx::intersector;
+#endif
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX2)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#define SELECT_SYMBOL_AVX2(features,intersector) \
+ if ((features & AVX2) == AVX2) intersector = avx2::intersector;
+#else
+#define SELECT_SYMBOL_AVX2(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512KNL)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512KNL(features,intersector) \
+ if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector;
+#else
+#define SELECT_SYMBOL_AVX512KNL(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512SKX)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512SKX(features,intersector) \
+ if ((features & AVX512SKX) == AVX512SKX) intersector = avx512skx::intersector;
+#else
+#define SELECT_SYMBOL_AVX512SKX(features,intersector)
+#endif
+
+#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+ ZERO_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+ SELECT_SYMBOL_DEFAULT(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \
+ INIT_SYMBOL(features,intersector); \
+ SELECT_SYMBOL_AVX512KNL(features,intersector); \
+ SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \
+ SELECT_SYMBOL_SSE42(features,intersector); \
+ SELECT_SYMBOL_AVX(features,intersector); \
+ SELECT_SYMBOL_AVX2(features,intersector);
+
+ struct VerifyMultiTargetLinking {
+ static __noinline int getISA(int depth = 5) {
+ if (depth == 0) return ISA;
+ else return getISA(depth-1);
+ }
+ };
+ namespace sse2 { int getISA(); };
+ namespace sse42 { int getISA(); };
+ namespace avx { int getISA(); };
+ namespace avx2 { int getISA(); };
+ namespace avx512knl { int getISA(); };
+ namespace avx512skx { int getISA(); };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
new file mode 100644
index 0000000000..82953f0e89
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/math/affinespace.h"
+#include "../../common/math/interval.h"
+
+#include <functional>
+
+namespace embree {
+
+#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f
+
+static void motion_derivative_coefficients(const float *p, float *coeff);
+
+struct MotionDerivativeCoefficients
+{
+ float theta;
+  float coeffs[3*8*7]; // 3 dimensions x 8 basis terms x 7 point components (1, p0, p1)
+
+ MotionDerivativeCoefficients() {}
+
+  // xfm0 and xfm1 are interpreted as quaternion decompositions
+ MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1)
+ {
+    // cosine of the angle between the two quaternions
+ const float cosTheta = min(1.f, max(-1.f,
+ xfm0.l.vx.w * xfm1.l.vx.w
+ + xfm0.l.vy.w * xfm1.l.vy.w
+ + xfm0.l.vz.w * xfm1.l.vz.w
+ + xfm0.p.w * xfm1.p.w));
+
+ theta = std::acos(cosTheta);
+ Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w);
+ if (cosTheta < 0.995f) {
+ // compute perpendicular quaternion
+ qperp.x = xfm1.p.w - cosTheta * xfm0.p.w;
+ qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w;
+ qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w;
+ qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w;
+ qperp = normalize(qperp);
+ }
+ const float p[33] = {
+ theta,
+ xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0
+ xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1
+ xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0
+ qperp.x, qperp.y, qperp.z, qperp.w,
+ xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0
+ xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y,
+ xfm0.l.vz.z, xfm0.p.z,
+ xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1
+ xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y,
+ xfm1.l.vz.z, xfm1.p.z
+ };
+ motion_derivative_coefficients(p, coeffs);
+ }
+};
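
The qperp computation in the constructor above is the standard slerp decomposition (a well-known identity, stated here for reference rather than taken from this header): with unit quaternions q0, q1, in LaTeX,

    \cos\theta = \langle q_0, q_1 \rangle, \qquad
    q_\perp = \frac{q_1 - q_0\cos\theta}{\lVert q_1 - q_0\cos\theta \rVert}, \qquad
    q(t) = q_0\cos(\theta t) + q_\perp\sin(\theta t).

The cosTheta < 0.995f guard is the usual near-parallel fallback: when the two quaternions almost coincide, q1 - q0 cos(theta) is dominated by rounding error, so qperp keeps q1's raw components instead.
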
+
+struct MotionDerivative
+{
+ float twoTheta;
+ float c[8];
+
+ MotionDerivative(MotionDerivativeCoefficients const& mdc,
+ int dim, Vec3fa const& p0, Vec3fa const& p1)
+ : twoTheta(2.f*mdc.theta)
+ {
+ const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z };
+ for (int i = 0; i < 8; ++i) {
+ c[i] = 0;
+ for (int j = 0; j < 7; ++j) {
+ c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j];
+ }
+ }
+ }
+
+ template<typename T>
+ struct EvalMotionDerivative
+ {
+ MotionDerivative const& md;
+ float offset;
+
+ EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {}
+
+ T operator()(T const& time) const {
+ return md.c[0] + md.c[1] * time
+ + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time)
+ + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time)
+ + offset;
+ }
+ };
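
Read directly off operator() above, the function whose roots are isolated has, per dimension, the closed form

    f(t) = c_0 + c_1 t + (c_2 + c_3 t + c_4 t^2)\cos(2\theta t)
                       + (c_5 + c_6 t + c_7 t^2)\sin(2\theta t) + \mathrm{offset},

i.e. a linear polynomial plus quadratically modulated sine and cosine terms at twice the quaternion angle; the eight coefficients per dimension are exactly what the generated table at the end of this file produces.
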
+
+ unsigned int findRoots(
+ Interval1f const& interval,
+ float offset,
+ float* roots,
+ unsigned int maxNumRoots)
+ {
+ unsigned int numRoots = 0;
+ EvalMotionDerivative<Interval1f> eval(*this, offset);
+ findRoots(eval, interval, numRoots, roots, maxNumRoots);
+ return numRoots;
+ }
+
+ template<typename Eval>
+  static void findRoots(
+    Eval const& eval,
+ Interval1f const& interval,
+ unsigned int& numRoots,
+ float* roots,
+ unsigned int maxNumRoots)
+ {
+ Interval1f range = eval(interval);
+ if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return;
+
+ const float split = 0.5f * (interval.upper + interval.lower);
+ if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f || abs(split-interval.upper) < 1e-7f)
+ {
+ // check if the root already exists
+ for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) {
+ if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON)
+ return;
+ }
+ if (numRoots < maxNumRoots) {
+ roots[numRoots++] = split;
+ }
+ if (numRoots > maxNumRoots) {
+ printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS
+ return;
+ }
+ return;
+ }
+
+ findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots);
+ findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots);
+ }
+};
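
findRoots above is a branch-and-prune bisection over interval arithmetic: evaluate the function on an interval; if the resulting range excludes zero, the whole subinterval is discarded; otherwise split at the midpoint and recurse until the interval is tiny, deduplicating near-equal roots. Below is a self-contained sketch of the same scheme, using hypothetical names (I, bisect_roots) rather than the types from this header.

    #include <cmath>
    #include <cstdio>
    #include <functional>
    #include <vector>

    struct I { float lo, hi; };  // a closed interval [lo, hi]

    static void bisect_roots(const std::function<I(I)>& f, I x, std::vector<float>& roots) {
      const I r = f(x);
      if (r.lo > 0.f || r.hi < 0.f) return;      // range excludes 0: prune subtree
      const float mid = 0.5f * (x.lo + x.hi);
      if (x.hi - x.lo < 1e-7f) {                 // interval small enough: accept
        if (roots.empty() || std::fabs(roots.back() - mid) > 1e-4f)
          roots.push_back(mid);                  // deduplicate near-equal roots
        return;
      }
      bisect_roots(f, I{x.lo, mid}, roots);
      bisect_roots(f, I{mid, x.hi}, roots);
    }

    int main() {
      // f(t) = t^2 - 0.25 over [0,1]; a naive interval extension of t*t - 0.25
      // (monotone on [0,1], so the range is [lo^2, hi^2] shifted by -0.25).
      auto f = [](I t) {
        const float a = t.lo * t.lo, b = t.hi * t.hi;
        return I{std::fmin(a, b) - 0.25f, std::fmax(a, b) - 0.25f};
      };
      std::vector<float> roots;
      bisect_roots(f, I{0.f, 1.f}, roots);
      for (float r : roots) std::printf("root near %f\n", r);  // ~0.5
    }

The pruning test is what makes this cheap in practice: most subintervals are rejected after a single interval evaluation, so the recursion only refines around actual sign changes.
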
+
+/******************************************************************************
+ * Code generated with sympy 1.4 *
+ * See http://www.sympy.org/ for more information. *
+ * *
+ * see *
+ * *
+ * scripts/generate_motion_derivative_coefficients.py *
+ * *
+ * for how this code is generated *
+ * *
+ ******************************************************************************/
+static void motion_derivative_coefficients(const float *p, float *coeff)
+{
+ coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27];
+ coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24];
+ coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25];
+ coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26];
+ coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15];
+ coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16];
+ coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17];
+ coeff[7] = 0;
+ coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24];
+ coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25];
+ coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26];
+ coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24];
+ coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25];
+ coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26];
+ coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27];
+ coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24];
+ coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25];
+ coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26];
+ coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15];
+ coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16];
+ coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17];
+ coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0];
+ coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24];
+ coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25];
+ coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26];
+ coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24];
+ coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25];
+ coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26];
+ coeff[28] = 0;
+ coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0];
+ coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0];
+ coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0];
+ coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0];
+ coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0];
+ coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0];
+ coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27];
+ coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24];
+ coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25];
+ coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26];
+ coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15];
+ coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16];
+ coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17];
+ coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0];
+ coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24];
+ coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25];
+ coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26];
+ coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24];
+ coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25];
+ coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26];
+ coeff[49] = 0;
+ coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0];
+ coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0];
+ coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0];
+ coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0];
+ coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0];
+ coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0];
+ coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30];
+ coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24];
+ coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28];
+ coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29];
+ coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15];
+ coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19];
+ coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20];
+ coeff[63] = 0;
+ coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+ coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28];
+ coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29];
+ coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+ coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28];
+ coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29];
+ coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30];
+ coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24];
+ coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28];
+ coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29];
+ coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15];
+ coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19];
+ coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20];
+ coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0];
+ coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+ coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28];
+ coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29];
+ coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+ coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28];
+ coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29];
+ coeff[84] = 0;
+ coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0];
+ coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0];
+ coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0];
+ coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0];
+ coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0];
+ coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0];
+ coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30];
+ coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24];
+ coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28];
+ coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29];
+ coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15];
+ coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19];
+ coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20];
+ coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0];
+ coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24];
+ coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28];
+ coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29];
+ coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24];
+ coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28];
+ coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29];
+ coeff[105] = 0;
+ coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0];
+ coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0];
+ coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0];
+ coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0];
+ coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0];
+ coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0];
+ coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32];
+ coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24];
+ coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28];
+ coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31];
+ coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15];
+ coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19];
+ coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22];
+ coeff[119] = 0;
+ coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+ coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+ coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31];
+ coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+ coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+ coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31];
+ coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30];
+ coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24];
+ coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28];
+ coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29];
+ coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15];
+ coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19];
+ coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20];
+ coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0];
+ coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+ coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+ coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29];
+ coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+ coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+ coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29];
+ coeff[140] = 0;
+ coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0];
+ coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0];
+ coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0];
+ coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0];
+ coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0];
+ coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0];
+ coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32];
+ coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24];
+ coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28];
+ coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31];
+ coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15];
+ coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19];
+ coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22];
+ coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0];
+ coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24];
+ coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28];
+ coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31];
+ coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24];
+ coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28];
+ coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31];
+ coeff[161] = 0;
+ coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0];
+ coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0];
+ coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0];
+ coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0];
+ coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0];
+ coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0];
+}
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h
new file mode 100644
index 0000000000..27d158ca3a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/point_query.h
@@ -0,0 +1,136 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+ /* Point query structure for closest point query */
+ template<int K>
+ struct RTC_ALIGN(16) PointQueryK
+ {
+ /* Default construction does nothing */
+ __forceinline PointQueryK() {}
+
+ /* Constructs a point query from the query point, the query radius,
+ * and the time for motion blur */
+ __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero)
+ : p(p), time(time), radius(radius) {}
+
+ /* Returns the size of the query packet */
+ static __forceinline size_t size() { return K; }
+
+ /* Calculates if this is a valid query that does not cause issues during traversal */
+ __forceinline vbool<K> valid() const
+ {
+ const vbool<K> vx = (abs(p.x) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vy = (abs(p.y) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vz = (abs(p.z) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vn = radius >= vfloat<K>(0);
+ const vbool<K> vf = abs(time) < vfloat<K>(inf);
+ return vx & vy & vz & vn & vf;
+ }
+
+ __forceinline void get(PointQueryK<1>* query) const;
+ __forceinline void get(size_t i, PointQueryK<1>& query) const;
+ __forceinline void set(const PointQueryK<1>* query);
+ __forceinline void set(size_t i, const PointQueryK<1>& query);
+
+ Vec3vf<K> p; // location of the query point
+ vfloat<K> time; // time for motion blur
+ vfloat<K> radius; // radius for the point query
+ };
+
+ /* Specialization for a single point query */
+ template<>
+ struct RTC_ALIGN(16) PointQueryK<1>
+ {
+ /* Default construction does nothing */
+ __forceinline PointQueryK() {}
+
+ /* Constructs a point query from the query point, the query radius,
+ * and the time for motion blur */
+ __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero)
+ : p(p), time(time), radius(radius) {}
+
+ /* Calculates if this is a valid query that does not cause issues during traversal */
+ __forceinline bool valid() const {
+ return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf);
+ }
+
+ Vec3f p; // location of the query point
+ float time; // time for motion blur
+ float radius; // radius for the point query
+ };
+
+ /* Converts a point query packet to single point queries */
+ template<int K>
+ __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const
+ {
+ for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose
+ {
+ query[i].p.x = p.x[i];
+ query[i].p.y = p.y[i];
+ query[i].p.z = p.z[i];
+ query[i].time = time[i];
+ query[i].radius = radius[i];
+ }
+ }
+
+ /* Extracts a single point query out of a point query packet */
+ template<int K>
+ __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const
+ {
+ query.p.x = p.x[i];
+ query.p.y = p.y[i];
+ query.p.z = p.z[i];
+ query.radius = radius[i];
+ query.time = time[i];
+ }
+
+ /* Converts single point queries to a point query packet */
+ template<int K>
+ __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query)
+ {
+ for (size_t i = 0; i < K; i++)
+ {
+ p.x[i] = query[i].p.x;
+ p.y[i] = query[i].p.y;
+ p.z[i] = query[i].p.z;
+ radius[i] = query[i].radius;
+ time[i] = query[i].time;
+ }
+ }
+
+ /* Inserts a single point query into a point query packet element */
+ template<int K>
+ __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query)
+ {
+ p.x[i] = query.p.x;
+ p.y[i] = query.p.y;
+ p.z[i] = query.p.z;
+ radius[i] = query.radius;
+ time[i] = query.time;
+ }
+
+ /* Shortcuts */
+ typedef PointQueryK<1> PointQuery;
+ typedef PointQueryK<4> PointQuery4;
+ typedef PointQueryK<8> PointQuery8;
+ typedef PointQueryK<16> PointQuery16;
+ struct PointQueryN;
+
+ /* Outputs point query to stream */
+ template<int K>
+ __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query)
+ {
+ cout << "{ " << embree_endl
+ << " p = " << query.p << embree_endl
+ << " r = " << query.radius << embree_endl
+ << " time = " << query.time << embree_endl
+ << "}";
+ return cout;
+ }
+}
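
For illustration, a minimal usage sketch of the structures above (a hypothetical standalone program; assumes an embree source tree where kernels/common/point_query.h and its dependencies are on the include path):

    #include "kernels/common/point_query.h"
    #include <cassert>

    int main()
    {
      using namespace embree;
      // Build a query around the origin with a search radius of 1 at time 0.
      PointQuery q(Vec3fa(0.0f, 0.0f, 0.0f), 1.0f, 0.0f);
      assert(q.valid()); // finite point, non-negative radius, finite time
      // Packets are filled element-wise from single queries.
      PointQuery4 q4;
      for (size_t i = 0; i < PointQuery4::size(); i++) q4.set(i, q);
      return 0;
    }
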
diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h
new file mode 100644
index 0000000000..ce75c982bb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref.h
@@ -0,0 +1,138 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+ /*! A primitive reference stores the bounds of the primitive and its ID. */
+ struct __aligned(32) PrimRef
+ {
+ __forceinline PrimRef () {}
+
+#if defined(__AVX__)
+ __forceinline PrimRef(const PrimRef& v) {
+ vfloat8::store((float*)this,vfloat8::load((float*)&v));
+ }
+ __forceinline PrimRef& operator=(const PrimRef& v) {
+ vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this;
+ }
+#endif
+
+ __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID)
+ {
+ lower = Vec3fx(bounds.lower, geomID);
+ upper = Vec3fx(bounds.upper, primID);
+ }
+
+ __forceinline PrimRef (const BBox3fa& bounds, size_t id)
+ {
+#if defined(__X86_64__) || defined(__aarch64__)
+ lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF));
+ upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF));
+#else
+ lower = Vec3fx(bounds.lower, (unsigned)id);
+ upper = Vec3fx(bounds.upper, (unsigned)0);
+#endif
+ }
+
+ /*! calculates twice the center of the primitive */
+ __forceinline const Vec3fa center2() const {
+ return lower+upper;
+ }
+
+ /*! return the bounding box of the primitive */
+ __forceinline const BBox3fa bounds() const {
+ return BBox3fa(lower,upper);
+ }
+
+ /*! size for bin heuristic is 1 */
+ __forceinline unsigned size() const {
+ return 1;
+ }
+
+ /*! returns bounds and centroid used for binning */
+ __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+ {
+ bounds_o = bounds();
+ center_o = embree::center2(bounds_o);
+ }
+
+ __forceinline unsigned& geomIDref() { // FIXME: remove !!!!!!!
+ return lower.u;
+ }
+ __forceinline unsigned& primIDref() { // FIXME: remove !!!!!!!
+ return upper.u;
+ }
+
+ /*! returns the geometry ID */
+ __forceinline unsigned geomID() const {
+ return lower.a;
+ }
+
+ /*! returns the primitive ID */
+ __forceinline unsigned primID() const {
+ return upper.a;
+ }
+
+ /*! returns a size_t sized ID */
+ __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+ return size_t(lower.u) + (size_t(upper.u) << 32);
+#else
+ return size_t(lower.u);
+#endif
+ }
+
+ /*! special function for operator< */
+ __forceinline uint64_t ID64() const {
+ return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+ }
+
+ /*! allows sorting the primrefs by ID */
+ friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) {
+ return p0.ID64() < p1.ID64();
+ }
+
+ /*! Outputs primitive reference to a stream. */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) {
+ return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }";
+ }
+
+ public:
+ Vec3fx lower; //!< lower bounds and geomID
+ Vec3fx upper; //!< upper bounds and primID
+ };
+
+ /*! fast exchange for PrimRefs */
+ __forceinline void xchg(PrimRef& a, PrimRef& b)
+ {
+#if defined(__AVX__)
+ const vfloat8 aa = vfloat8::load((float*)&a);
+ const vfloat8 bb = vfloat8::load((float*)&b);
+ vfloat8::store((float*)&a,bb);
+ vfloat8::store((float*)&b,aa);
+#else
+ std::swap(a,b);
+#endif
+ }
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+ struct SubGridBuildData {
+ unsigned short sx,sy;
+ unsigned int primID;
+
+ __forceinline SubGridBuildData() {};
+ __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {};
+
+ __forceinline size_t x() const { return (size_t)sx & 0x7fff; }
+ __forceinline size_t y() const { return (size_t)sy & 0x7fff; }
+
+ };
+}
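
The 64-bit ID round trip in PrimRef above is just two 32-bit halves parked in lower.u and upper.u on 64-bit targets. A standalone sketch of the same packing (plain C++, hypothetical ID value, no embree dependencies):

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint64_t id = 0x0000000500000007ull;             // hypothetical 64-bit ID
      const uint32_t lo = uint32_t(id & 0xFFFFFFFF);         // what goes into lower.u
      const uint32_t hi = uint32_t((id >> 32) & 0xFFFFFFFF); // what goes into upper.u
      // mirrors PrimRef::ID(): reassemble the two halves
      const uint64_t restored = uint64_t(lo) + (uint64_t(hi) << 32);
      assert(restored == id);
      return 0;
    }
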
diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
new file mode 100644
index 0000000000..b6c1ad5712
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
@@ -0,0 +1,262 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+#define MBLUR_BIN_LBBOX 1
+
+namespace embree
+{
+#if MBLUR_BIN_LBBOX
+
+ /*! A primitive reference stores the bounds of the primitive and its ID. */
+ struct PrimRefMB
+ {
+ typedef LBBox3fa BBox;
+
+ __forceinline PrimRefMB () {}
+
+ __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+ : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+ {
+ assert(activeTimeSegments > 0);
+ lbounds.bounds0.lower.a = geomID;
+ lbounds.bounds0.upper.a = primID;
+ lbounds.bounds1.lower.a = activeTimeSegments;
+ lbounds.bounds1.upper.a = totalTimeSegments;
+ }
+
+ __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+ : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+ {
+ assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+ lbounds.bounds0.lower.a = id & 0xFFFFFFFF;
+ lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF;
+#else
+ lbounds.bounds0.lower.a = id;
+ lbounds.bounds0.upper.a = 0;
+#endif
+ lbounds.bounds1.lower.a = activeTimeSegments;
+ lbounds.bounds1.upper.a = totalTimeSegments;
+ }
+
+ __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+ : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+ {
+ assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+ lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
+ lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+ lbounds.bounds0.lower.u = id;
+ lbounds.bounds0.upper.u = 0;
+#endif
+ lbounds.bounds1.lower.a = activeTimeSegments;
+ lbounds.bounds1.upper.a = totalTimeSegments;
+ }
+
+ /*! returns bounds for binning */
+ __forceinline LBBox3fa bounds() const {
+ return lbounds;
+ }
+
+ /*! returns the number of time segments of this primref */
+ __forceinline unsigned size() const {
+ return lbounds.bounds1.lower.a;
+ }
+
+ __forceinline unsigned totalTimeSegments() const {
+ return lbounds.bounds1.upper.a;
+ }
+
+ /* calculate overlapping time segment range */
+ __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+ return getTimeSegmentRange(range,time_range,float(totalTimeSegments()));
+ }
+
+ /* returns time that corresponds to time step */
+ __forceinline float timeStep(const int i) const {
+ assert(i>=0 && i<=(int)totalTimeSegments());
+ return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments());
+ }
+
+ /*! checks if time range overlaps */
+ __forceinline bool time_range_overlap(const BBox1f& range) const
+ {
+ if (0.9999f*time_range.upper <= range.lower) return false;
+ if (1.0001f*time_range.lower >= range.upper) return false;
+ return true;
+ }
+
+ /*! returns center for binning */
+ __forceinline Vec3fa binCenter() const {
+ return center2(lbounds.interpolate(0.5f));
+ }
+
+ /*! returns bounds and centroid used for binning */
+ __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const
+ {
+ bounds_o = bounds();
+ center_o = binCenter();
+ }
+
+ /*! returns the geometry ID */
+ __forceinline unsigned geomID() const {
+ return lbounds.bounds0.lower.a;
+ }
+
+ /*! returns the primitive ID */
+ __forceinline unsigned primID() const {
+ return lbounds.bounds0.upper.a;
+ }
+
+ /*! returns a size_t sized ID */
+ __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+ return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
+#else
+ return size_t(lbounds.bounds0.lower.u);
+#endif
+ }
+
+ /*! special function for operator< */
+ __forceinline uint64_t ID64() const {
+ return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+ }
+
+ /*! allows sorting the primrefs by ID */
+ friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+ return p0.ID64() < p1.ID64();
+ }
+
+ /*! Outputs primitive reference to a stream. */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+ return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }";
+ }
+
+ public:
+ LBBox3fx lbounds;
+ BBox1f time_range; // entire geometry time range
+ };
+
+#else
+
+ /*! A primitive reference stores the bounds of the primitive and its ID. */
+ struct __aligned(16) PrimRefMB
+ {
+ typedef BBox3fa BBox;
+
+ __forceinline PrimRefMB () {}
+
+ __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+ : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+ {
+ assert(activeTimeSegments > 0);
+ bbox.lower.a = geomID;
+ bbox.upper.a = primID;
+ }
+
+ __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+ : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+ {
+ assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+ bbox.lower.u = id & 0xFFFFFFFF;
+ bbox.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+ bbox.lower.u = id;
+ bbox.upper.u = 0;
+#endif
+ }
+
+ /*! returns bounds for binning */
+ __forceinline BBox3fa bounds() const {
+ return bbox;
+ }
+
+ /*! returns the number of time segments of this primref */
+ __forceinline unsigned int size() const {
+ return _activeTimeSegments;
+ }
+
+ __forceinline unsigned int totalTimeSegments() const {
+ return _totalTimeSegments;
+ }
+
+ /* calculate overlapping time segment range */
+ __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+ return getTimeSegmentRange(range,time_range,float(_totalTimeSegments));
+ }
+
+ /* returns time that corresponds to time step */
+ __forceinline float timeStep(const int i) const {
+ assert(i>=0 && i<=(int)_totalTimeSegments);
+ return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments);
+ }
+
+ /*! checks if time range overlaps */
+ __forceinline bool time_range_overlap(const BBox1f& range) const
+ {
+ if (0.9999f*time_range.upper <= range.lower) return false;
+ if (1.0001f*time_range.lower >= range.upper) return false;
+ return true;
+ }
+
+ /*! returns center for binning */
+ __forceinline Vec3fa binCenter() const {
+ return center2(bounds());
+ }
+
+ /*! returns bounds and centroid used for binning */
+ __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+ {
+ bounds_o = bounds();
+ center_o = center2(bounds());
+ }
+
+ /*! returns the geometry ID */
+ __forceinline unsigned int geomID() const {
+ return bbox.lower.a;
+ }
+
+ /*! returns the primitive ID */
+ __forceinline unsigned int primID() const {
+ return bbox.upper.a;
+ }
+
+ /*! returns a size_t sized ID */
+ __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+ return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32);
+#else
+ return size_t(bbox.lower.u);
+#endif
+ }
+
+ /*! special function for operator< */
+ __forceinline uint64_t ID64() const {
+ return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+ }
+
+ /*! allows sorting the primrefs by ID */
+ friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+ return p0.ID64() < p1.ID64();
+ }
+
+ /*! Outputs primitive reference to a stream. */
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+ return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }";
+ }
+
+ public:
+ BBox3fa bbox; // bounds, geomID, primID
+ unsigned int _activeTimeSegments;
+ unsigned int _totalTimeSegments;
+ BBox1f time_range; // entire geometry time range
+ };
+
+#endif
+}
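
To make the time-step mapping above concrete: a primitive whose time_range is [0.2, 0.8] with 3 total segments places its time steps at 0.2, 0.4, 0.6, and 0.8. A standalone check of the arithmetic used by timeStep() (plain C++, hypothetical values):

    #include <cstdio>

    int main()
    {
      const float lower = 0.2f, upper = 0.8f; // hypothetical time_range
      const int total = 3;                    // totalTimeSegments()
      for (int i = 0; i <= total; i++) {
        // mirrors timeStep(): lower + size * i / total
        const float t = lower + (upper - lower) * float(i) / float(total);
        printf("step %d -> t = %.2f\n", i, t);
      }
      return 0;
    }
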
diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h
new file mode 100644
index 0000000000..a7de36414d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/profile.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+ /*! helper structure for the implementation of the profile functions below */
+ struct ProfileTimer
+ {
+ static const size_t N = 20;
+
+ ProfileTimer () {}
+
+ ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0)
+ {
+ for (size_t i=0; i<N; i++) names[i] = nullptr;
+ for (size_t i=0; i<N; i++) dt_fst[i] = 0.0;
+ for (size_t i=0; i<N; i++) dt_min[i] = pos_inf;
+ for (size_t i=0; i<N; i++) dt_avg[i] = 0.0;
+ for (size_t i=0; i<N; i++) dt_max[i] = neg_inf;
+ }
+
+ __forceinline void begin()
+ {
+ j=0;
+ t0 = tj = getSeconds();
+ }
+
+ __forceinline void end() {
+ absolute("total");
+ i++;
+ }
+
+ __forceinline void operator() (const char* name) {
+ relative(name);
+ }
+
+ __forceinline void absolute (const char* name)
+ {
+ const double t1 = getSeconds();
+ const double dt = t1-t0;
+ assert(names[j] == nullptr || names[j] == name);
+ names[j] = name;
+ if (i == 0) dt_fst[j] = dt;
+ if (i>=numSkip) {
+ dt_min[j] = min(dt_min[j],dt);
+ dt_avg[j] = dt_avg[j] + dt;
+ dt_max[j] = max(dt_max[j],dt);
+ }
+ j++;
+ maxJ = max(maxJ,j);
+ }
+
+ __forceinline void relative (const char* name)
+ {
+ const double t1 = getSeconds();
+ const double dt = t1-tj;
+ tj = t1;
+ assert(names[j] == nullptr || names[j] == name);
+ names[j] = name;
+ if (i == 0) dt_fst[j] = dt;
+ if (i>=numSkip) {
+ dt_min[j] = min(dt_min[j],dt);
+ dt_avg[j] = dt_avg[j] + dt;
+ dt_max[j] = max(dt_max[j],dt);
+ }
+ j++;
+ maxJ = max(maxJ,j);
+ }
+
+ void print(size_t numElements)
+ {
+ for (size_t k=0; k<N; k++)
+ dt_avg[k] /= double(i-numSkip);
+
+ printf(" profile [M/s]:\n");
+ for (size_t j=0; j<maxJ; j++)
+ printf("%20s: fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n",
+ names[j],numElements/dt_fst[j]*1E-6,numElements/dt_max[j]*1E-6,numElements/dt_avg[j]*1E-6,numElements/dt_min[j]*1E-6);
+
+ printf(" profile [ms]:\n");
+ for (size_t j=0; j<maxJ; j++)
+ printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+ names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]);
+ }
+
+ void print()
+ {
+ printf(" profile:\n");
+
+ for (size_t k=0; k<N; k++)
+ dt_avg[k] /= double(i-numSkip);
+
+ for (size_t j=0; j<maxJ; j++) {
+ printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+ names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]);
+ }
+ }
+
+ double avg() {
+ return dt_avg[maxJ-1]/double(i-numSkip);
+ }
+
+ private:
+ size_t i;
+ size_t j;
+ size_t maxJ;
+ size_t numSkip;
+ double t0;
+ double tj;
+ const char* names[N];
+ double dt_fst[N];
+ double dt_min[N];
+ double dt_avg[N];
+ double dt_max[N];
+ };
+
+ /*! This function executes some code block multiple times and measures sections of it.
+ Use it the following way:
+
+ profile(1,10,1000,[&](ProfileTimer& timer) {
+ // code
+ timer("A");
+ // code
+ timer("B");
+ });
+ */
+ template<typename Closure>
+ void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+ {
+ ProfileTimer timer(numSkip);
+
+ for (size_t i=0; i<numSkip+numIter; i++)
+ {
+ timer.begin();
+ closure(timer);
+ timer.end();
+ }
+ timer.print(numElements);
+ }
+
+ /*! Same as the function above, but the timer object is provided externally */
+ template<typename Closure>
+ void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+ {
+ timer = ProfileTimer(numSkip);
+
+ for (size_t i=0; i<numSkip+numIter; i++)
+ {
+ timer.begin();
+ closure(timer);
+ timer.end();
+ }
+ timer.print(numElements);
+ }
+}
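
A hedged usage sketch of the closure overload declared above (a hypothetical standalone file; assumes kernels/common/profile.h and its dependencies compile in the including project):

    #include "kernels/common/profile.h"
    #include <algorithm>
    #include <numeric>
    #include <vector>

    int main()
    {
      using namespace embree;
      std::vector<float> data(1000000);
      // skip 1 warm-up iteration, then measure 10 iterations over 1M elements
      profile(1, 10, data.size(), [&](ProfileTimer& timer) {
        std::fill(data.begin(), data.end(), 1.0f);
        timer("fill"); // relative time since begin()/the previous section
        volatile float sum = std::accumulate(data.begin(), data.end(), 0.0f);
        (void)sum;
        timer("sum");  // relative time of the summation phase
      });
      return 0;
    }
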
diff --git a/thirdparty/embree-aarch64/kernels/common/ray.h b/thirdparty/embree-aarch64/kernels/common/ray.h
new file mode 100644
index 0000000000..336d48942c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/ray.h
@@ -0,0 +1,1517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "instance_stack.h"
+
+// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
+
+namespace embree
+{
+ static const size_t MAX_INTERNAL_STREAM_SIZE = 32;
+
+ /* Ray structure for K rays */
+ template<int K>
+ struct RayK
+ {
+ /* Default construction does nothing */
+ __forceinline RayK() {}
+
+ /* Constructs a ray from origin, direction, and ray segment. Near
+ * has to be smaller than far */
+ __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+ const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+ const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+ : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {}
+
+ /* Returns the size of the ray */
+ static __forceinline size_t size() { return K; }
+
+ /* Calculates if this is a valid ray that does not cause issues during traversal */
+ __forceinline vbool<K> valid() const
+ {
+ const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf);
+ const vbool<K> vf = abs(tfar) <= vfloat<K>(inf);
+ return vx & vy & vz & vn & vf;
+ }
+
+ __forceinline void get(RayK<1>* ray) const;
+ __forceinline void get(size_t i, RayK<1>& ray) const;
+ __forceinline void set(const RayK<1>* ray);
+ __forceinline void set(size_t i, const RayK<1>& ray);
+
+ __forceinline void copy(size_t dest, size_t source);
+
+ __forceinline vint<K> octant() const
+ {
+ return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) |
+ select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) |
+ select(dir.z < 0.0f, vint<K>(4), vint<K>(zero));
+ }
+
+ /* Ray data */
+ Vec3vf<K> org; // ray origin
+ vfloat<K> _tnear; // start of ray segment
+ Vec3vf<K> dir; // ray direction
+ vfloat<K> _time; // time of this ray for motion blur
+ vfloat<K> tfar; // end of ray segment
+ vint<K> mask; // used to mask out objects during traversal
+ vint<K> id;
+ vint<K> flags;
+
+ __forceinline vfloat<K>& tnear() { return _tnear; }
+ __forceinline vfloat<K>& time() { return _time; }
+ __forceinline const vfloat<K>& tnear() const { return _tnear; }
+ __forceinline const vfloat<K>& time() const { return _time; }
+ };
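
The octant() member above packs the sign bits of the three direction components into a 3-bit index (0..7), which the kernels can use, for example, to group rays with similar direction signs. A scalar illustration of the same encoding (plain C++, no SIMD types):

    #include <cassert>

    // mirrors RayK::octant() for a single ray direction
    static int octant(float dx, float dy, float dz) {
      return (dx < 0.0f ? 1 : 0) | (dy < 0.0f ? 2 : 0) | (dz < 0.0f ? 4 : 0);
    }

    int main()
    {
      assert(octant( 1.0f,  1.0f,  1.0f) == 0); // all components non-negative
      assert(octant(-1.0f,  1.0f,  1.0f) == 1); // negative x sets bit 0
      assert(octant( 1.0f, -1.0f, -1.0f) == 6); // negative y and z set bits 1 and 2
      return 0;
    }
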
+
+ /* Ray+hit structure for K rays */
+ template<int K>
+ struct RayHitK : RayK<K>
+ {
+ using RayK<K>::org;
+ using RayK<K>::_tnear;
+ using RayK<K>::dir;
+ using RayK<K>::_time;
+ using RayK<K>::tfar;
+ using RayK<K>::mask;
+ using RayK<K>::id;
+ using RayK<K>::flags;
+
+ using RayK<K>::tnear;
+ using RayK<K>::time;
+
+ /* Default construction does nothing */
+ __forceinline RayHitK() {}
+
+ /* Constructs a ray from origin, direction, and ray segment. Near
+ * has to be smaller than far */
+ __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+ const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+ const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+ : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags),
+ geomID(RTC_INVALID_GEOMETRY_ID)
+ {
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ instID[l] = RTC_INVALID_GEOMETRY_ID;
+ }
+
+ __forceinline RayHitK(const RayK<K>& ray)
+ : RayK<K>(ray),
+ geomID(RTC_INVALID_GEOMETRY_ID)
+ {
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ instID[l] = RTC_INVALID_GEOMETRY_ID;
+ }
+
+ __forceinline RayHitK<K>& operator =(const RayK<K>& ray)
+ {
+ org = ray.org;
+ _tnear = ray._tnear;
+ dir = ray.dir;
+ _time = ray._time;
+ tfar = ray.tfar;
+ mask = ray.mask;
+ id = ray.id;
+ flags = ray.flags;
+
+ geomID = RTC_INVALID_GEOMETRY_ID;
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ instID[l] = RTC_INVALID_GEOMETRY_ID;
+
+ return *this;
+ }
+
+ /* Verifies that the hit is valid and throws an error otherwise */
+ __forceinline void verifyHit(const vbool<K>& valid0) const
+ {
+ vbool<K> valid = valid0 & geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID);
+ const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf));
+ const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vv = (abs(v) <= vfloat<K>(FLT_LARGE));
+ const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE);
+ const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE);
+ const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE);
+ if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t");
+ if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u");
+ if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v");
+ if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x");
+ if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y");
+ if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z");
+ }
+
+ __forceinline void get(RayHitK<1>* ray) const;
+ __forceinline void get(size_t i, RayHitK<1>& ray) const;
+ __forceinline void set(const RayHitK<1>* ray);
+ __forceinline void set(size_t i, const RayHitK<1>& ray);
+
+ __forceinline void copy(size_t dest, size_t source);
+
+ /* Hit data */
+ Vec3vf<K> Ng; // geometry normal
+ vfloat<K> u; // barycentric u coordinate of hit
+ vfloat<K> v; // barycentric v coordinate of hit
+ vuint<K> primID; // primitive ID
+ vuint<K> geomID; // geometry ID
+ vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+ };
+
+ /* Specialization for a single ray */
+ template<>
+ struct RayK<1>
+ {
+ /* Default construction does nothing */
+ __forceinline RayK() {}
+
+ /* Constructs a ray from origin, direction, and ray segment. Near
+ * has to be smaller than far */
+ __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+ : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {}
+
+ /* Calculates if this is a valid ray that does not cause issues during traversal */
+ __forceinline bool valid() const {
+ return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf);
+ }
+
+ /* Ray data */
+ Vec3ff org; // 3 floats for ray origin, 1 float for tnear
+ //float tnear; // start of ray segment
+ Vec3ff dir; // 3 floats for ray direction, 1 float for time
+ // float time;
+ float tfar; // end of ray segment
+ int mask; // used to mask out objects during traversal
+ int id; // ray ID
+ int flags; // ray flags
+
+ __forceinline float& tnear() { return org.w; };
+ __forceinline const float& tnear() const { return org.w; };
+
+ __forceinline float& time() { return dir.w; };
+ __forceinline const float& time() const { return dir.w; };
+
+ };
+
+ template<>
+ struct RayHitK<1> : RayK<1>
+ {
+ /* Default construction does nothing */
+ __forceinline RayHitK() {}
+
+ /* Constructs a ray from origin, direction, and ray segment. Near
+ * has to be smaller than far */
+ __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+ : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags),
+ geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+ __forceinline RayHitK(const RayK<1>& ray)
+ : RayK<1>(ray),
+ geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+ __forceinline RayHitK<1>& operator =(const RayK<1>& ray)
+ {
+ org = ray.org;
+ dir = ray.dir;
+ tfar = ray.tfar;
+ mask = ray.mask;
+ id = ray.id;
+ flags = ray.flags;
+
+ geomID = RTC_INVALID_GEOMETRY_ID;
+
+ return *this;
+ }
+
+ /* Verifies that the hit is valid and throws an error otherwise */
+ __forceinline void verifyHit() const
+ {
+ if (geomID == RTC_INVALID_GEOMETRY_ID) return;
+ const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf));
+ const bool vu = (abs(u) <= FLT_LARGE);
+ const bool vv = (abs(v) <= FLT_LARGE);
+ const bool vnx = abs(Ng.x) <= FLT_LARGE;
+ const bool vny = abs(Ng.y) <= FLT_LARGE;
+ const bool vnz = abs(Ng.z) <= FLT_LARGE;
+ if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t");
+ if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u");
+ if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v");
+ if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x");
+ if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y");
+ if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z");
+ }
+
+ /* Hit data */
+ Vec3f Ng; // not normalized geometry normal
+ float u; // barycentric u coordinate of hit
+ float v; // barycentric v coordinate of hit
+ unsigned int primID; // primitive ID
+ unsigned int geomID; // geometry ID
+ unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+ };
+
+ /* Converts ray packet to single rays */
+ template<int K>
+ __forceinline void RayK<K>::get(RayK<1>* ray) const
+ {
+ for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose
+ {
+ ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = tnear()[i];
+ ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time() = time()[i];
+ ray[i].tfar = tfar[i]; ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i];
+ }
+ }
+
+ template<int K>
+ __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const
+ {
+ // FIXME: use SIMD transpose
+ for (size_t i = 0; i < K; i++)
+ get(i, ray[i]);
+ }
+
+ /* Extracts a single ray out of a ray packet */
+ template<int K>
+ __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const
+ {
+ ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i];
+ ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time() = time()[i];
+ ray.tfar = tfar[i]; ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i];
+ }
+
+ template<int K>
+ __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const
+ {
+ ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i];
+ ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar = tfar[i]; ray.time() = time()[i];
+ ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i];
+ ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i];
+ ray.u = u[i]; ray.v = v[i];
+ ray.primID = primID[i]; ray.geomID = geomID[i];
+
+ instance_id_stack::copy(instID, ray.instID, i);
+ }
+
+ /* Converts single rays to ray packet */
+ template<int K>
+ __forceinline void RayK<K>::set(const RayK<1>* ray)
+ {
+ // FIXME: use SIMD transpose
+ for (size_t i = 0; i < K; i++)
+ set(i, ray[i]);
+ }
+
+ template<int K>
+ __forceinline void RayHitK<K>::set(const RayHitK<1>* ray)
+ {
+ // FIXME: use SIMD transpose
+ for (size_t i = 0; i < K; i++)
+ set(i, ray[i]);
+ }
+
+ /* Inserts a single ray into a ray packet element */
+ template<int K>
+ __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray)
+ {
+ org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear();
+ dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time();
+ tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;
+ }
+
+ template<int K>
+ __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray)
+ {
+ org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear();
+ dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time();
+ tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;
+ Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z;
+ u[i] = ray.u; v[i] = ray.v;
+ primID[i] = ray.primID; geomID[i] = ray.geomID;
+
+ instance_id_stack::copy(ray.instID, instID, i);
+ }
+
+ /* Copies a ray packet element into another element */
+ template<int K>
+ __forceinline void RayK<K>::copy(size_t dest, size_t source)
+ {
+ org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source];
+ dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source];
+ tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source];
+ }
+
+ template<int K>
+ __forceinline void RayHitK<K>::copy(size_t dest, size_t source)
+ {
+ org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source];
+ dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source];
+ tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source];
+ Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source];
+ u[dest] = u[source]; v[dest] = v[source];
+ primID[dest] = primID[source]; geomID[dest] = geomID[source];
+
+ instance_id_stack::copy(instID, instID, source, dest);
+ }
+
+ /* Shortcuts */
+ typedef RayK<1> Ray;
+ typedef RayK<4> Ray4;
+ typedef RayK<8> Ray8;
+ typedef RayK<16> Ray16;
+ struct RayN;
+
+ typedef RayHitK<1> RayHit;
+ typedef RayHitK<4> RayHit4;
+ typedef RayHitK<8> RayHit8;
+ typedef RayHitK<16> RayHit16;
+ struct RayHitN;
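
A minimal construction sketch using the shortcuts above (hypothetical standalone usage; assumes kernels/common/ray.h and its dependencies are available):

    #include "kernels/common/ray.h"
    #include <cassert>

    int main()
    {
      using namespace embree;
      // Ray from the origin along +z, active on the segment [0, inf).
      Ray ray(Vec3fa(0.0f), Vec3fa(0.0f, 0.0f, 1.0f));
      assert(ray.valid());
      // A RayHit starts out with an invalid hit; intersectors fill it in.
      RayHit rayhit(ray);
      assert(rayhit.geomID == RTC_INVALID_GEOMETRY_ID);
      return 0;
    }
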
+
+ template<int K, bool intersect>
+ struct RayTypeHelper;
+
+ template<int K>
+ struct RayTypeHelper<K, true>
+ {
+ typedef RayHitK<K> Ty;
+ };
+
+ template<int K>
+ struct RayTypeHelper<K, false>
+ {
+ typedef RayK<K> Ty;
+ };
+
+ template<bool intersect>
+ using RayType = typename RayTypeHelper<1, intersect>::Ty;
+
+ template<int K, bool intersect>
+ using RayTypeK = typename RayTypeHelper<K, intersect>::Ty;
+
+ /* Outputs ray to stream */
+ template<int K>
+ __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray)
+ {
+ return cout << "{ " << embree_endl
+ << " org = " << ray.org << embree_endl
+ << " dir = " << ray.dir << embree_endl
+ << " near = " << ray.tnear() << embree_endl
+ << " far = " << ray.tfar << embree_endl
+ << " time = " << ray.time() << embree_endl
+ << " mask = " << ray.mask << embree_endl
+ << " id = " << ray.id << embree_endl
+ << " flags = " << ray.flags << embree_endl
+ << "}";
+ }
+
+ template<int K>
+ __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray)
+ {
+ cout << "{ " << embree_endl
+ << " org = " << ray.org << embree_endl
+ << " dir = " << ray.dir << embree_endl
+ << " near = " << ray.tnear() << embree_endl
+ << " far = " << ray.tfar << embree_endl
+ << " time = " << ray.time() << embree_endl
+ << " mask = " << ray.mask << embree_endl
+ << " id = " << ray.id << embree_endl
+ << " flags = " << ray.flags << embree_endl
+ << " Ng = " << ray.Ng
+ << " u = " << ray.u << embree_endl
+ << " v = " << ray.v << embree_endl
+ << " primID = " << ray.primID << embree_endl
+ << " geomID = " << ray.geomID << embree_endl
+ << " instID =";
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ {
+ cout << " " << ray.instID[l];
+ }
+ cout << embree_endl;
+ return cout << "}";
+ }
+
+ struct RayStreamSOA
+ {
+ __forceinline RayStreamSOA(void* rays, size_t N)
+ : ptr((char*)rays), N(N) {}
+
+ /* ray data access functions */
+ __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; } // x coordinate of ray origin
+ __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; } // y coordinate of ray origin
+ __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin
+ __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment
+
+ __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction
+ __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction
+ __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction
+ __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur
+
+ __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance)
+ __forceinline int* mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset]; }; // used to mask out objects during traversal (optional)
+ __forceinline int* id (size_t offset = 0) { return (int*)&ptr[10*4*N+offset]; }; // id
+ __forceinline int* flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset]; }; // flags
+
+ /* hit data access functions */
+ __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal
+ __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal
+ __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal
+
+ __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }; // barycentric u coordinate of hit
+ __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }; // barycentric v coordinate of hit
+
+ __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; }; // primitive ID
+ __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }; // geometry ID
+ __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }; // instance ID
+
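
The accessors above assume a fully SOA layout for a stream of N rays: field k of ray i starts at byte offset k*4*N + 4*i, so each field occupies a contiguous block of N scalars. A standalone check of that addressing (plain C++, hypothetical N):

    #include <cassert>
    #include <cstddef>

    int main()
    {
      const size_t N = 8; // rays per stream (hypothetical)
      auto fieldOffset = [N](size_t k, size_t i) { return k*4*N + 4*i; };
      assert(fieldOffset(0, 0) == 0);         // org_x of ray 0 starts the buffer
      assert(fieldOffset(3, 2) == 3*4*N + 8); // tnear (field 3) of ray 2
      // consecutive rays of one field are 4 bytes apart: tightly packed scalars
      assert(fieldOffset(5, 4) - fieldOffset(5, 3) == 4);
      return 0;
    }
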
+ __forceinline Ray getRayByOffset(size_t offset)
+ {
+ Ray ray;
+ ray.org.x = org_x(offset)[0];
+ ray.org.y = org_y(offset)[0];
+ ray.org.z = org_z(offset)[0];
+ ray.tnear() = tnear(offset)[0];
+ ray.dir.x = dir_x(offset)[0];
+ ray.dir.y = dir_y(offset)[0];
+ ray.dir.z = dir_z(offset)[0];
+ ray.time() = time(offset)[0];
+ ray.tfar = tfar(offset)[0];
+ ray.mask = mask(offset)[0];
+ ray.id = id(offset)[0];
+ ray.flags = flags(offset)[0];
+ return ray;
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(size_t offset)
+ {
+ RayK<K> ray;
+ ray.org.x = vfloat<K>::loadu(org_x(offset));
+ ray.org.y = vfloat<K>::loadu(org_y(offset));
+ ray.org.z = vfloat<K>::loadu(org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(tnear(offset));
+ ray.dir.x = vfloat<K>::loadu(dir_x(offset));
+ ray.dir.y = vfloat<K>::loadu(dir_y(offset));
+ ray.dir.z = vfloat<K>::loadu(dir_z(offset));
+      ray.time() = vfloat<K>::loadu(time(offset));
+ ray.tfar = vfloat<K>::loadu(tfar(offset));
+ ray.mask = vint<K>::loadu(mask(offset));
+ ray.id = vint<K>::loadu(id(offset));
+ ray.flags = vint<K>::loadu(flags(offset));
+ return ray;
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+ {
+ RayK<K> ray;
+ ray.org.x = vfloat<K>::loadu(valid, org_x(offset));
+ ray.org.y = vfloat<K>::loadu(valid, org_y(offset));
+ ray.org.z = vfloat<K>::loadu(valid, org_z(offset));
+ ray.tnear() = vfloat<K>::loadu(valid, tnear(offset));
+ ray.dir.x = vfloat<K>::loadu(valid, dir_x(offset));
+ ray.dir.y = vfloat<K>::loadu(valid, dir_y(offset));
+ ray.dir.z = vfloat<K>::loadu(valid, dir_z(offset));
+ ray.time() = vfloat<K>::loadu(valid, time(offset));
+ ray.tfar = vfloat<K>::loadu(valid, tfar(offset));
+
+#if !defined(__AVX__)
+ /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults,
+ because the SSE masked loads always access the entire vector */
+ if (unlikely(!all(valid)))
+ {
+ ray.mask = zero;
+ ray.id = zero;
+ ray.flags = zero;
+
+ for (size_t k = 0; k < K; k++)
+ {
+ if (likely(valid[k]))
+ {
+ ray.mask[k] = mask(offset)[k];
+ ray.id[k] = id(offset)[k];
+ ray.flags[k] = flags(offset)[k];
+ }
+ }
+ }
+ else
+#endif
+ {
+ ray.mask = vint<K>::loadu(valid, mask(offset));
+ ray.id = vint<K>::loadu(valid, id(offset));
+ ray.flags = vint<K>::loadu(valid, flags(offset));
+ }
+
+ return ray;
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+ {
+ /*
+ * valid_i: stores which of the input rays exist (do not access nonexistent rays!)
+ * valid: stores which of the rays actually hit something.
+ */
+ vbool<K> valid = valid_i;
+ valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+ if (likely(any(valid)))
+ {
+ vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+ vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x);
+ vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y);
+ vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z);
+ vfloat<K>::storeu(valid, u(offset), ray.u);
+ vfloat<K>::storeu(valid, v(offset), ray.v);
+
+#if !defined(__AVX__)
+ /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults,
+ because the SSE masked stores always access the entire vector */
+ if (unlikely(!all(valid_i)))
+ {
+ for (size_t k = 0; k < K; k++)
+ {
+ if (likely(valid[k]))
+ {
+ primID(offset)[k] = ray.primID[k];
+ geomID(offset)[k] = ray.geomID[k];
+
+ instID(0, offset)[k] = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+ instID(l, offset)[k] = ray.instID[l][k];
+#endif
+ }
+ }
+ }
+ else
+#endif
+ {
+ vuint<K>::storeu(valid, primID(offset), ray.primID);
+ vuint<K>::storeu(valid, geomID(offset), ray.geomID);
+
+ vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+ vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]);
+#endif
+ }
+ }
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.tfar < 0.0f);
+
+ if (likely(any(valid)))
+ vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+ }
+
+ __forceinline size_t getOctantByOffset(size_t offset)
+ {
+ const float dx = dir_x(offset)[0];
+ const float dy = dir_y(offset)[0];
+ const float dz = dir_z(offset)[0];
+ const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+ return octantID;
+ }
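+    /* Example: a direction of (-1, +1, -1) has dx < 0 and dz < 0, so the
+       octant is 1 + 0 + 4 = 5. Grouping rays by octant keeps direction signs
+       uniform within a packet during stream traversal. */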
+
+ __forceinline bool isValidByOffset(size_t offset)
+ {
+ const float nnear = tnear(offset)[0];
+ const float ffar = tfar(offset)[0];
+ return nnear <= ffar;
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+ {
+ RayK<K> ray;
+
+#if defined(__AVX2__)
+ ray.org.x = vfloat<K>::template gather<1>(valid, org_x(), offset);
+ ray.org.y = vfloat<K>::template gather<1>(valid, org_y(), offset);
+ ray.org.z = vfloat<K>::template gather<1>(valid, org_z(), offset);
+ ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset);
+ ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x(), offset);
+ ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y(), offset);
+ ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z(), offset);
+ ray.time() = vfloat<K>::template gather<1>(valid, time(), offset);
+ ray.tfar = vfloat<K>::template gather<1>(valid, tfar(), offset);
+ ray.mask = vint<K>::template gather<1>(valid, mask(), offset);
+ ray.id = vint<K>::template gather<1>(valid, id(), offset);
+ ray.flags = vint<K>::template gather<1>(valid, flags(), offset);
+#else
+ ray.org = zero;
+ ray.tnear() = zero;
+ ray.dir = zero;
+ ray.time() = zero;
+ ray.tfar = zero;
+ ray.mask = zero;
+ ray.id = zero;
+ ray.flags = zero;
+
+ for (size_t k = 0; k < K; k++)
+ {
+ if (likely(valid[k]))
+ {
+ const size_t ofs = offset[k];
+
+ ray.org.x[k] = *org_x(ofs);
+ ray.org.y[k] = *org_y(ofs);
+ ray.org.z[k] = *org_z(ofs);
+ ray.tnear()[k] = *tnear(ofs);
+ ray.dir.x[k] = *dir_x(ofs);
+ ray.dir.y[k] = *dir_y(ofs);
+ ray.dir.z[k] = *dir_z(ofs);
+ ray.time()[k] = *time(ofs);
+ ray.tfar[k] = *tfar(ofs);
+ ray.mask[k] = *mask(ofs);
+ ray.id[k] = *id(ofs);
+ ray.flags[k] = *flags(ofs);
+ }
+ }
+#endif
+
+ return ray;
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+ if (likely(any(valid)))
+ {
+#if defined(__AVX512F__)
+ vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+ vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x);
+ vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y);
+ vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z);
+ vfloat<K>::template scatter<1>(valid, u(), offset, ray.u);
+ vfloat<K>::template scatter<1>(valid, v(), offset, ray.v);
+ vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID);
+ vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID);
+
+ vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+ vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]);
+#endif
+#else
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ const size_t ofs = offset[k];
+
+ *tfar(ofs) = ray.tfar[k];
+
+ *Ng_x(ofs) = ray.Ng.x[k];
+ *Ng_y(ofs) = ray.Ng.y[k];
+ *Ng_z(ofs) = ray.Ng.z[k];
+ *u(ofs) = ray.u[k];
+ *v(ofs) = ray.v[k];
+ *primID(ofs) = ray.primID[k];
+ *geomID(ofs) = ray.geomID[k];
+
+ *instID(0, ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+ *instID(l, ofs) = ray.instID[l][k];
+#endif
+ }
+#endif
+ }
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.tfar < 0.0f);
+
+ if (likely(any(valid)))
+ {
+#if defined(__AVX512F__)
+ vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+#else
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ const size_t ofs = offset[k];
+
+ *tfar(ofs) = ray.tfar[k];
+ }
+#endif
+ }
+ }
+
+ char* __restrict__ ptr;
+ size_t N;
+ };
+
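+  /* Illustrative sketch (not part of Embree): writing lane k of an SOA stream
+     of N rays and reading it back as a scalar Ray. The per-lane byte offset
+     is 4*k, and the buffer must cover the ray, hit, and instance-ID fields.
+
+       const size_t N = 8, k = 3;
+       float buffer[(19 + RTC_MAX_INSTANCE_LEVEL_COUNT) * N] = {}; // zero-initialized
+       RayStreamSOA stream(buffer, N);
+       stream.org_x(4*k)[0] = 0.0f;
+       stream.dir_x(4*k)[0] = 1.0f;            // unit ray along +x
+       stream.tnear(4*k)[0] = 0.0f;
+       stream.tfar (4*k)[0] = float(pos_inf);
+       Ray ray = stream.getRayByOffset(4*k);
+  */
+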
+ template<size_t MAX_K>
+ struct StackRayStreamSOA : public RayStreamSOA
+ {
+ __forceinline StackRayStreamSOA(size_t K)
+ : RayStreamSOA(data, K) { assert(K <= MAX_K); }
+
+ char data[MAX_K / 4 * sizeof(RayHit4)];
+ };
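+
+  /* The stack buffer above is sized so MAX_K rays fit in SOA layout: RayHit4
+     holds every ray and hit field for 4 lanes, so MAX_K/4 such blocks provide
+     exactly the storage RayStreamSOA expects for MAX_K rays. */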
+
+
+ struct RayStreamSOP
+ {
+ template<class T>
+ __forceinline void init(T& t)
+ {
+ org_x = (float*)&t.org.x;
+ org_y = (float*)&t.org.y;
+ org_z = (float*)&t.org.z;
+ tnear = (float*)&t.tnear;
+ dir_x = (float*)&t.dir.x;
+ dir_y = (float*)&t.dir.y;
+ dir_z = (float*)&t.dir.z;
+ time = (float*)&t.time;
+ tfar = (float*)&t.tfar;
+ mask = (unsigned int*)&t.mask;
+ id = (unsigned int*)&t.id;
+ flags = (unsigned int*)&t.flags;
+
+ Ng_x = (float*)&t.Ng.x;
+ Ng_y = (float*)&t.Ng.y;
+ Ng_z = (float*)&t.Ng.z;
+ u = (float*)&t.u;
+ v = (float*)&t.v;
+ primID = (unsigned int*)&t.primID;
+ geomID = (unsigned int*)&t.geomID;
+
+ for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+ instID[l] = (unsigned int*)&t.instID[l];
+ }
+
+ __forceinline Ray getRayByOffset(size_t offset)
+ {
+ Ray ray;
+ ray.org.x = *(float* __restrict__)((char*)org_x + offset);
+ ray.org.y = *(float* __restrict__)((char*)org_y + offset);
+ ray.org.z = *(float* __restrict__)((char*)org_z + offset);
+ ray.dir.x = *(float* __restrict__)((char*)dir_x + offset);
+ ray.dir.y = *(float* __restrict__)((char*)dir_y + offset);
+ ray.dir.z = *(float* __restrict__)((char*)dir_z + offset);
+ ray.tfar = *(float* __restrict__)((char*)tfar + offset);
+ ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+ ray.time() = time ? *(float* __restrict__)((char*)time + offset) : 0.0f;
+ ray.mask = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1;
+ ray.id = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1;
+ ray.flags = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1;
+ return ray;
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+ {
+ RayK<K> ray;
+ ray.org.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset));
+ ray.org.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset));
+ ray.org.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset));
+ ray.dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+ ray.dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+ ray.dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+ ray.tfar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+ ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+ ray.time() = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f;
+ ray.mask = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1;
+ ray.id = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1;
+ ray.flags = flags ? vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1;
+ return ray;
+ }
+
+ template<int K>
+ __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset)
+ {
+ Vec3vf<K> dir;
+ dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+ dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+ dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+ return dir;
+ }
+
+ __forceinline void setHitByOffset(size_t offset, const RayHit& ray)
+ {
+ if (ray.geomID != RTC_INVALID_GEOMETRY_ID)
+ {
+ *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+
+ if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x;
+ if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y;
+ if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z;
+ *(float* __restrict__)((char*)u + offset) = ray.u;
+ *(float* __restrict__)((char*)v + offset) = ray.v;
+ *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID;
+ *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID;
+
+ if (likely(instID[0])) {
+ *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l)
+ *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l];
+#endif
+ }
+ }
+ }
+
+ __forceinline void setHitByOffset(size_t offset, const Ray& ray)
+ {
+ *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+ if (likely(any(valid)))
+ {
+ vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+
+ if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x);
+ if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y);
+ if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z);
+ vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u);
+ vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v);
+ vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID);
+ vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID);
+
+ if (likely(instID[0])) {
+ vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+ vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]);
+#endif
+ }
+ }
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.tfar < 0.0f);
+
+ if (likely(any(valid)))
+ vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+ }
+
+ __forceinline size_t getOctantByOffset(size_t offset)
+ {
+ const float dx = *(float* __restrict__)((char*)dir_x + offset);
+ const float dy = *(float* __restrict__)((char*)dir_y + offset);
+ const float dz = *(float* __restrict__)((char*)dir_z + offset);
+ const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+ return octantID;
+ }
+
+ __forceinline bool isValidByOffset(size_t offset)
+ {
+ const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+ const float ffar = *(float* __restrict__)((char*)tfar + offset);
+ return nnear <= ffar;
+ }
+
+ template<int K>
+ __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset)
+ {
+ const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+ const vfloat<K> ffar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+ return nnear <= ffar;
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+ {
+ RayK<K> ray;
+
+#if defined(__AVX2__)
+ ray.org.x = vfloat<K>::template gather<1>(valid, org_x, offset);
+ ray.org.y = vfloat<K>::template gather<1>(valid, org_y, offset);
+ ray.org.z = vfloat<K>::template gather<1>(valid, org_z, offset);
+ ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x, offset);
+ ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y, offset);
+ ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z, offset);
+ ray.tfar = vfloat<K>::template gather<1>(valid, tfar, offset);
+ ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero);
+ ray.time() = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero);
+ ray.mask = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1);
+ ray.id = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1);
+ ray.flags = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1);
+#else
+ ray.org = zero;
+ ray.tnear() = zero;
+ ray.dir = zero;
+ ray.tfar = zero;
+ ray.time() = zero;
+ ray.mask = zero;
+ ray.id = zero;
+ ray.flags = zero;
+
+ for (size_t k = 0; k < K; k++)
+ {
+ if (likely(valid[k]))
+ {
+ const size_t ofs = offset[k];
+
+ ray.org.x[k] = *(float* __restrict__)((char*)org_x + ofs);
+ ray.org.y[k] = *(float* __restrict__)((char*)org_y + ofs);
+ ray.org.z[k] = *(float* __restrict__)((char*)org_z + ofs);
+ ray.dir.x[k] = *(float* __restrict__)((char*)dir_x + ofs);
+ ray.dir.y[k] = *(float* __restrict__)((char*)dir_y + ofs);
+ ray.dir.z[k] = *(float* __restrict__)((char*)dir_z + ofs);
+ ray.tfar[k] = *(float* __restrict__)((char*)tfar + ofs);
+ ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f;
+ ray.time()[k] = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f;
+ ray.mask[k] = mask ? *(int* __restrict__)((char*)mask + ofs) : -1;
+ ray.id[k] = id ? *(int* __restrict__)((char*)id + ofs) : -1;
+ ray.flags[k] = flags ? *(int* __restrict__)((char*)flags + ofs) : -1;
+ }
+ }
+#endif
+
+ return ray;
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+ if (likely(any(valid)))
+ {
+#if defined(__AVX512F__)
+ vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+
+ if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x);
+ if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y);
+ if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z);
+ vfloat<K>::template scatter<1>(valid, u, offset, ray.u);
+ vfloat<K>::template scatter<1>(valid, v, offset, ray.v);
+ vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID);
+ vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID);
+
+ if (likely(instID[0])) {
+ vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+ vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]);
+#endif
+ }
+#else
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ const size_t ofs = offset[k];
+
+ *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+
+ if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k];
+ if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k];
+ if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k];
+ *(float* __restrict__)((char*)u + ofs) = ray.u[k];
+ *(float* __restrict__)((char*)v + ofs) = ray.v[k];
+ *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k];
+ *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k];
+
+ if (likely(instID[0])) {
+ *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+ *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k];
+#endif
+ }
+ }
+#endif
+ }
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.tfar < 0.0f);
+
+ if (likely(any(valid)))
+ {
+#if defined(__AVX512F__)
+ vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+#else
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ const size_t ofs = offset[k];
+
+ *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+ }
+#endif
+ }
+ }
+
+ /* ray data */
+ float* __restrict__ org_x; // x coordinate of ray origin
+ float* __restrict__ org_y; // y coordinate of ray origin
+ float* __restrict__ org_z; // z coordinate of ray origin
+ float* __restrict__ tnear; // start of ray segment (optional)
+
+ float* __restrict__ dir_x; // x coordinate of ray direction
+ float* __restrict__ dir_y; // y coordinate of ray direction
+ float* __restrict__ dir_z; // z coordinate of ray direction
+ float* __restrict__ time; // time of this ray for motion blur (optional)
+
+ float* __restrict__ tfar; // end of ray segment (set to hit distance)
+ unsigned int* __restrict__ mask; // used to mask out objects during traversal (optional)
+ unsigned int* __restrict__ id; // ray ID
+ unsigned int* __restrict__ flags; // ray flags
+
+ /* hit data */
+ float* __restrict__ Ng_x; // x coordinate of geometry normal (optional)
+ float* __restrict__ Ng_y; // y coordinate of geometry normal (optional)
+ float* __restrict__ Ng_z; // z coordinate of geometry normal (optional)
+
+ float* __restrict__ u; // barycentric u coordinate of hit
+ float* __restrict__ v; // barycentric v coordinate of hit
+
+ unsigned int* __restrict__ primID; // primitive ID
+ unsigned int* __restrict__ geomID; // geometry ID
+ unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional)
+ };
+
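+  /* RayStreamSOP is a struct-of-pointers view: each field is addressed through
+     its own base pointer, set up by init() from the members of an existing
+     packet, and optional fields may be null. The null defaults applied on load
+     match the single-ray API: 0.0f for tnear and time, and -1 (all bits set)
+     for mask, id, and flags, as implemented in getRayByOffset() above. */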
+
+ struct RayStreamAOS
+ {
+ __forceinline RayStreamAOS(void* rays)
+ : ptr((Ray*)rays) {}
+
+ __forceinline Ray& getRayByOffset(size_t offset)
+ {
+ return *(Ray*)((char*)ptr + offset);
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(const vint<K>& offset);
+
+ template<int K>
+ __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+ {
+      const vint<K> valid_offset = select(valid, offset, vint<K>(zero));
+ return getRayByOffset(valid_offset);
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+ if (likely(any(valid)))
+ {
+#if defined(__AVX512F__)
+ vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+ vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x);
+ vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y);
+ vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z);
+ vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u);
+ vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v);
+ vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID);
+ vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID);
+
+ vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+ for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+ vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]);
+#endif
+#else
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]);
+ ray_k->tfar = ray.tfar[k];
+ ray_k->Ng.x = ray.Ng.x[k];
+ ray_k->Ng.y = ray.Ng.y[k];
+ ray_k->Ng.z = ray.Ng.z[k];
+ ray_k->u = ray.u[k];
+ ray_k->v = ray.v[k];
+ ray_k->primID = ray.primID[k];
+ ray_k->geomID = ray.geomID[k];
+
+ instance_id_stack::copy(ray.instID, ray_k->instID, k);
+ }
+#endif
+ }
+ }
+
+ template<int K>
+ __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.tfar < 0.0f);
+
+ if (likely(any(valid)))
+ {
+#if defined(__AVX512F__)
+ vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+#else
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]);
+ ray_k->tfar = ray.tfar[k];
+ }
+#endif
+ }
+ }
+
+ Ray* __restrict__ ptr;
+ };
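+
+  /* Illustrative sketch (not part of Embree): for an array-of-structs stream
+     the per-ray byte offset is simply the element index times the struct
+     stride.
+
+       RTCRayHit rays[16];
+       RayStreamAOS stream(rays);
+       Ray& r5 = stream.getRayByOffset(5 * sizeof(RTCRayHit));
+  */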
+
+ template<>
+ __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset)
+ {
+ Ray4 ray;
+
+ /* load and transpose: org.x, org.y, org.z, tnear */
+ const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org);
+ const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+ const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+ const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+
+ transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear());
+
+ /* load and transpose: dir.x, dir.y, dir.z, time */
+ const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir);
+ const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir);
+ const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir);
+ const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir);
+
+ transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+ /* load and transpose: tfar, mask, id, flags */
+ const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+ const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+ const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+ const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+
+ vfloat4 maskf, idf, flagsf;
+ transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf);
+ ray.mask = asInt(maskf);
+ ray.id = asInt(idf);
+ ray.flags = asInt(flagsf);
+
+ return ray;
+ }
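+
+  /* The specialization above exploits the Ray layout: org.x, org.y, org.z,
+     tnear occupy 16 contiguous bytes, as do dir.x, dir.y, dir.z, time and
+     tfar, mask, id, flags, so each group is fetched with one unaligned 4-wide
+     load per ray and converted to SOA form by a 4x4 transpose. */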
+
+#if defined(__AVX__)
+ template<>
+ __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset)
+ {
+ Ray8 ray;
+
+ /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+ const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org);
+ const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+ const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+ const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+ const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org);
+ const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org);
+ const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org);
+ const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org);
+
+ transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+ /* load and transpose: tfar, mask, id, flags */
+ const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+ const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+ const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+ const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+ const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar);
+ const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar);
+ const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar);
+ const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar);
+
+ vfloat8 maskf, idf, flagsf;
+ transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf);
+ ray.mask = asInt(maskf);
+ ray.id = asInt(idf);
+ ray.flags = asInt(flagsf);
+
+ return ray;
+ }
+#endif
+
+#if defined(__AVX512F__)
+ template<>
+ __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset)
+ {
+ Ray16 ray;
+
+ /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+ const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org);
+ const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org);
+ const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org);
+ const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org);
+ const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org);
+ const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org);
+ const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org);
+ const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org);
+ const vfloat8 ab8 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org);
+ const vfloat8 ab9 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org);
+ const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org);
+ const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org);
+ const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org);
+ const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org);
+ const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org);
+ const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org);
+
+ transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+ ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+ /* load and transpose: tfar, mask, id, flags */
+ const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar);
+ const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar);
+ const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar);
+ const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar);
+ const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar);
+ const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar);
+ const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar);
+ const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar);
+ const vfloat4 c8 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar);
+ const vfloat4 c9 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar);
+ const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar);
+ const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar);
+ const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar);
+ const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar);
+ const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar);
+ const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar);
+
+ vfloat16 maskf, idf, flagsf;
+ transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+ ray.tfar, maskf, idf, flagsf);
+ ray.mask = asInt(maskf);
+ ray.id = asInt(idf);
+ ray.flags = asInt(flagsf);
+
+ return ray;
+ }
+#endif
+
+
+ struct RayStreamAOP
+ {
+ __forceinline RayStreamAOP(void* rays)
+ : ptr((Ray**)rays) {}
+
+ __forceinline Ray& getRayByIndex(size_t index)
+ {
+ return *ptr[index];
+ }
+
+ template<int K>
+ __forceinline RayK<K> getRayByIndex(const vint<K>& index);
+
+ template<int K>
+ __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index)
+ {
+      const vint<K> valid_index = select(valid, index, vint<K>(zero));
+ return getRayByIndex(valid_index);
+ }
+
+ template<int K>
+ __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+ if (likely(any(valid)))
+ {
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]];
+
+ ray_k->tfar = ray.tfar[k];
+ ray_k->Ng.x = ray.Ng.x[k];
+ ray_k->Ng.y = ray.Ng.y[k];
+ ray_k->Ng.z = ray.Ng.z[k];
+ ray_k->u = ray.u[k];
+ ray_k->v = ray.v[k];
+ ray_k->primID = ray.primID[k];
+ ray_k->geomID = ray.geomID[k];
+ instance_id_stack::copy(ray.instID, ray_k->instID, k);
+ }
+ }
+ }
+
+ template<int K>
+ __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray)
+ {
+ vbool<K> valid = valid_i;
+ valid &= (ray.tfar < 0.0f);
+
+ if (likely(any(valid)))
+ {
+ size_t valid_bits = movemask(valid);
+ while (valid_bits != 0)
+ {
+ const size_t k = bscf(valid_bits);
+ Ray* __restrict__ ray_k = ptr[index[k]];
+
+ ray_k->tfar = ray.tfar[k];
+ }
+ }
+ }
+
+ Ray** __restrict__ ptr;
+ };
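+
+  /* Illustrative sketch (not part of Embree): an array-of-pointers stream
+     indexes rays indirectly, which is the layout rtcIntersect1Mp consumes.
+
+       RTCRayHit r0, r1;
+       RTCRayHit* ptrs[2] = { &r0, &r1 };
+       RayStreamAOP stream(ptrs);
+       Ray& first = stream.getRayByIndex(0); // aliases r0
+  */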
+
+ template<>
+ __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index)
+ {
+ Ray4 ray;
+
+ /* load and transpose: org.x, org.y, org.z, tnear */
+ const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org);
+ const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org);
+ const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org);
+ const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org);
+
+ transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear());
+
+ /* load and transpose: dir.x, dir.y, dir.z, time */
+ const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir);
+ const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir);
+ const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir);
+ const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir);
+
+ transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+ /* load and transpose: tfar, mask, id, flags */
+ const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+ const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+ const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+ const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+
+ vfloat4 maskf, idf, flagsf;
+ transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf);
+ ray.mask = asInt(maskf);
+ ray.id = asInt(idf);
+ ray.flags = asInt(flagsf);
+
+ return ray;
+ }
+
+#if defined(__AVX__)
+ template<>
+ __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index)
+ {
+ Ray8 ray;
+
+ /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+ const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org);
+ const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org);
+ const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org);
+ const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org);
+ const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org);
+ const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org);
+ const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org);
+ const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org);
+
+ transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+ /* load and transpose: tfar, mask, id, flags */
+ const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+ const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+ const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+ const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+ const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar);
+ const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar);
+ const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar);
+ const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar);
+
+ vfloat8 maskf, idf, flagsf;
+ transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf);
+ ray.mask = asInt(maskf);
+ ray.id = asInt(idf);
+ ray.flags = asInt(flagsf);
+
+ return ray;
+ }
+#endif
+
+#if defined(__AVX512F__)
+ template<>
+ __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index)
+ {
+ Ray16 ray;
+
+ /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+ const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org);
+ const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org);
+ const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org);
+ const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org);
+ const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org);
+ const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org);
+ const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org);
+ const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org);
+ const vfloat8 ab8 = vfloat8::loadu(&ptr[index[8]]->org);
+ const vfloat8 ab9 = vfloat8::loadu(&ptr[index[9]]->org);
+ const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org);
+ const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org);
+ const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org);
+ const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org);
+ const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org);
+ const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org);
+
+ transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+ ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+ /* load and transpose: tfar, mask, id, flags */
+ const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+ const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+ const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+ const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+ const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar);
+ const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar);
+ const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar);
+ const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar);
+ const vfloat4 c8 = vfloat4::loadu(&ptr[index[8]]->tfar);
+ const vfloat4 c9 = vfloat4::loadu(&ptr[index[9]]->tfar);
+ const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar);
+ const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar);
+ const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar);
+ const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar);
+ const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar);
+ const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar);
+
+ vfloat16 maskf, idf, flagsf;
+ transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+ ray.tfar, maskf, idf, flagsf);
+
+ ray.mask = asInt(maskf);
+ ray.id = asInt(idf);
+ ray.flags = asInt(flagsf);
+
+ return ray;
+ }
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
new file mode 100644
index 0000000000..625fbf6d4f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
@@ -0,0 +1,1799 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "../../include/embree3/rtcore_ray.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+using namespace embree;
+
+RTC_NAMESPACE_BEGIN;
+
+ /* mutex to make API thread safe */
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ static std::mutex g_mutex;
+#else
+ static MutexSys g_mutex;
+#endif
+
+ RTC_API RTCDevice rtcNewDevice(const char* config)
+ {
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcNewDevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(g_mutex);
+#else
+ Lock<MutexSys> lock(g_mutex);
+#endif
+ Device* device = new Device(config);
+ return (RTCDevice) device->refInc();
+ RTC_CATCH_END(nullptr);
+ return (RTCDevice) nullptr;
+ }
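+
+  /* Typical usage (sketch): the handle returned by rtcNewDevice carries one
+     reference owned by the caller, who hands it back with rtcReleaseDevice;
+     passing a null device to rtcGetDeviceError reports the per-thread
+     creation error.
+
+       RTCDevice device = rtcNewDevice("threads=4");
+       if (rtcGetDeviceError(nullptr) != RTC_ERROR_NONE)
+         return; // device creation failed
+       // ... use the device ...
+       rtcReleaseDevice(device);
+  */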
+
+ RTC_API void rtcRetainDevice(RTCDevice hdevice)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcRetainDevice);
+ RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(g_mutex);
+#else
+ Lock<MutexSys> lock(g_mutex);
+#endif
+ device->refInc();
+ RTC_CATCH_END(nullptr);
+ }
+
+ RTC_API void rtcReleaseDevice(RTCDevice hdevice)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcReleaseDevice);
+ RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(g_mutex);
+#else
+ Lock<MutexSys> lock(g_mutex);
+#endif
+ device->refDec();
+ RTC_CATCH_END(nullptr);
+ }
+
+ RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetDeviceProperty);
+ RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(g_mutex);
+#else
+ Lock<MutexSys> lock(g_mutex);
+#endif
+ return device->getProperty(prop);
+ RTC_CATCH_END(device);
+ return 0;
+ }
+
+ RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetDeviceProperty);
+ const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004;
+ if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(g_mutex);
+#else
+ Lock<MutexSys> lock(g_mutex);
+#endif
+ device->setProperty(prop,val);
+ RTC_CATCH_END(device);
+ }
+
+ RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetDeviceError);
+ if (device == nullptr) return Device::getThreadErrorCode();
+ else return device->getDeviceErrorCode();
+ RTC_CATCH_END(device);
+ return RTC_ERROR_UNKNOWN;
+ }
+
+ RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetDeviceErrorFunction);
+ RTC_VERIFY_HANDLE(hdevice);
+ device->setErrorFunction(error, userPtr);
+ RTC_CATCH_END(device);
+ }
+
+ RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetDeviceMemoryMonitorFunction);
+ device->setMemoryMonitorFunction(memoryMonitor, userPtr);
+ RTC_CATCH_END(device);
+ }
+
+ RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize)
+ {
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcNewBuffer);
+ RTC_VERIFY_HANDLE(hdevice);
+ Buffer* buffer = new Buffer((Device*)hdevice, byteSize);
+ return (RTCBuffer)buffer->refInc();
+ RTC_CATCH_END((Device*)hdevice);
+ return nullptr;
+ }
+
+ RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize)
+ {
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcNewSharedBuffer);
+ RTC_VERIFY_HANDLE(hdevice);
+ Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr);
+ return (RTCBuffer)buffer->refInc();
+ RTC_CATCH_END((Device*)hdevice);
+ return nullptr;
+ }
+
+ RTC_API void* rtcGetBufferData(RTCBuffer hbuffer)
+ {
+ Buffer* buffer = (Buffer*)hbuffer;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetBufferData);
+ RTC_VERIFY_HANDLE(hbuffer);
+ return buffer->data();
+ RTC_CATCH_END2(buffer);
+ return nullptr;
+ }
+
+ RTC_API void rtcRetainBuffer(RTCBuffer hbuffer)
+ {
+ Buffer* buffer = (Buffer*)hbuffer;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcRetainBuffer);
+ RTC_VERIFY_HANDLE(hbuffer);
+ buffer->refInc();
+ RTC_CATCH_END2(buffer);
+ }
+
+ RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer)
+ {
+ Buffer* buffer = (Buffer*)hbuffer;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcReleaseBuffer);
+ RTC_VERIFY_HANDLE(hbuffer);
+ buffer->refDec();
+ RTC_CATCH_END2(buffer);
+ }
+
+ RTC_API RTCScene rtcNewScene (RTCDevice hdevice)
+ {
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcNewScene);
+ RTC_VERIFY_HANDLE(hdevice);
+ Scene* scene = new Scene((Device*)hdevice);
+ return (RTCScene) scene->refInc();
+ RTC_CATCH_END((Device*)hdevice);
+ return nullptr;
+ }
+
+ RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetSceneDevice);
+ RTC_VERIFY_HANDLE(hscene);
+ return (RTCDevice)scene->device->refInc(); // user will own one additional device reference
+ RTC_CATCH_END2(scene);
+ return (RTCDevice)nullptr;
+ }
+
+ RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetSceneProgressMonitorFunction);
+ RTC_VERIFY_HANDLE(hscene);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(g_mutex);
+#else
+ Lock<MutexSys> lock(g_mutex);
+#endif
+ scene->setProgressMonitorFunction(progress,ptr);
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetSceneBuildQuality);
+ RTC_VERIFY_HANDLE(hscene);
+ if (quality != RTC_BUILD_QUALITY_LOW &&
+ quality != RTC_BUILD_QUALITY_MEDIUM &&
+ quality != RTC_BUILD_QUALITY_HIGH)
+ // -- GODOT start --
+ // throw std::runtime_error("invalid build quality");
+ abort();
+ // -- GODOT end --
+ scene->setBuildQuality(quality);
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetSceneFlags);
+ RTC_VERIFY_HANDLE(hscene);
+ scene->setSceneFlags(flags);
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetSceneFlags);
+ RTC_VERIFY_HANDLE(hscene);
+ return scene->getSceneFlags();
+ RTC_CATCH_END2(scene);
+ return RTC_SCENE_FLAG_NONE;
+ }
+
+ RTC_API void rtcCommitScene (RTCScene hscene)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcCommitScene);
+ RTC_VERIFY_HANDLE(hscene);
+ scene->commit(false);
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcJoinCommitScene (RTCScene hscene)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcJoinCommitScene);
+ RTC_VERIFY_HANDLE(hscene);
+ scene->commit(true);
+ RTC_CATCH_END2(scene);
+ }
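+
+  /* rtcCommitScene builds using Embree's internal task scheduler, while
+     rtcJoinCommitScene lets several user threads call into the same scene and
+     cooperatively join a single build. Sketch (assuming the caller spawns the
+     threads itself):
+
+       // executed concurrently by each user thread:
+       rtcJoinCommitScene(scene);
+  */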
+
+ RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetSceneBounds);
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ BBox3fa bounds = scene->bounds.bounds();
+ bounds_o->lower_x = bounds.lower.x;
+ bounds_o->lower_y = bounds.lower.y;
+ bounds_o->lower_z = bounds.lower.z;
+ bounds_o->align0 = 0;
+ bounds_o->upper_x = bounds.upper.x;
+ bounds_o->upper_y = bounds.upper.y;
+ bounds_o->upper_z = bounds.upper.z;
+ bounds_o->align1 = 0;
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneLinearBounds);
+ RTC_VERIFY_HANDLE(hscene);
+ if (bounds_o == nullptr)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+ if (scene->isModified())
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+
+ bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x;
+ bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y;
+ bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z;
+ bounds_o->bounds0.align0 = 0;
+ bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x;
+ bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y;
+ bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z;
+ bounds_o->bounds0.align1 = 0;
+ bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x;
+ bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y;
+ bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z;
+ bounds_o->bounds1.align0 = 0;
+ bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x;
+ bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y;
+ bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z;
+ bounds_o->bounds1.align1 = 0;
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr)
+ {
+ Scene* scene0 = (Scene*) hscene0;
+ Scene* scene1 = (Scene*) hscene1;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcCollide);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene0);
+ RTC_VERIFY_HANDLE(hscene1);
+    if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices");
+ auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+ auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    if (scene0->numPrimitives() != nUserPrims0 || scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep");
+#endif
+ scene0->intersectors.collide(scene0,scene1,callback,userPtr);
+ RTC_CATCH_END(scene0->device);
+ }
+
+ inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+ {
+ bool changed = false;
+ if (userContext->instStackSize > 0)
+ {
+ const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+
+ float similarityScale = 0.f;
+ const bool similtude = similarityTransform(transform, &similarityScale);
+ assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f));
+
+ PointQuery query_inst;
+ query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z));
+ query_inst.radius = query->radius * similarityScale;
+ query_inst.time = query->time;
+
+ PointQueryContext context_inst(scene, (PointQuery*)query,
+ similtude ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB,
+ queryFunc, userContext, similarityScale, userPtr);
+ changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst);
+ }
+ else
+ {
+ PointQueryContext context(scene, (PointQuery*)query,
+ POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr);
+ changed = scene->intersectors.pointQuery((PointQuery*)query, &context);
+ }
+ return changed;
+ }
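+
+  /* Instance handling above relies on a geometric fact: under a similarity
+     transform (rotation, uniform scale s, translation) a sphere of radius r
+     maps to a sphere of radius s*r, so the query can stay a sphere query in
+     instance space. Under a general affine transform the image of the sphere
+     is an ellipsoid, so the code conservatively falls back to an AABB query. */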
+
+ RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcPointQuery);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ RTC_VERIFY_HANDLE(userContext);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");
+ if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes");
+#endif
+
+ return pointQuery(scene, query, userContext, queryFunc, userPtr);
+ RTC_CATCH_END2_FALSE(scene);
+ }
+
+ RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcPointQuery4);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+ if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(point_query.travs,cnt,cnt,cnt);
+
+ bool changed = false;
+ PointQuery4* query4 = (PointQuery4*)query;
+ PointQuery query1;
+ for (size_t i=0; i<4; i++) {
+ if (!valid[i]) continue;
+ query4->get(i,query1);
+ changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+ query4->set(i,query1);
+ }
+ return changed;
+ RTC_CATCH_END2_FALSE(scene);
+ }
+
+ RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcPointQuery8);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+ if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(point_query.travs,cnt,cnt,cnt);
+
+ bool changed = false;
+ PointQuery8* query8 = (PointQuery8*)query;
+ PointQuery query1;
+ for (size_t i=0; i<8; i++) {
+ if (!valid[i]) continue;
+ query8->get(i,query1);
+ changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+ query8->set(i,query1);
+ }
+ return changed;
+ RTC_CATCH_END2_FALSE(scene);
+ }
+
+ RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcPointQuery16);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+ if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");
+#endif
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(point_query.travs,cnt,cnt,cnt);
+
+ bool changed = false;
+ PointQuery16* query16 = (PointQuery16*)query;
+ PointQuery query1;
+ for (size_t i=0; i<16; i++) {
+ if (!valid[i]) continue;
+      query16->get(i,query1);
+ changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+ query16->set(i,query1);
+ }
+ return changed;
+ RTC_CATCH_END2_FALSE(scene);
+ }
+
+ RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersect1);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");
+#endif
+ STAT3(normal.travs,1,1,1);
+ IntersectContext context(scene,user_context);
+ scene->intersectors.intersect(*rayhit,&context);
+#if defined(DEBUG)
+ ((RayHit*)rayhit)->verifyHit();
+#endif
+ RTC_CATCH_END2(scene);
+ }
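+
+  /* Typical usage (sketch): the context must be initialized and the ray
+     fields set before the call; a miss leaves hit.geomID at
+     RTC_INVALID_GEOMETRY_ID.
+
+       RTCIntersectContext context;
+       rtcInitIntersectContext(&context);
+       RTCRayHit rayhit = {};                           // org = (0,0,0), tnear = 0
+       rayhit.ray.dir_z = 1.0f;                         // shoot along +z
+       rayhit.ray.tfar = std::numeric_limits<float>::infinity();
+       rayhit.ray.mask = (unsigned)-1;
+       rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+       rtcIntersect1(scene, &context, &rayhit);
+       const bool hit = rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID;
+  */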
+
+ RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersect4);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+ if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(normal.travs,cnt,cnt,cnt);
+
+ IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+ RayHit4* rayhit4 = (RayHit4*)rayhit;
+ for (size_t i=0; i<4; i++) {
+ if (!valid[i]) continue;
+ RayHit ray1; rayhit4->get(i,ray1);
+ scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+ rayhit4->set(i,ray1);
+ }
+#else
+ scene->intersectors.intersect4(valid,*rayhit,&context);
+#endif
+
+ RTC_CATCH_END2(scene);
+ }
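+
+  /* The valid mask follows SIMD convention: valid[i] == -1 (all bits set)
+     marks an active lane and valid[i] == 0 an inactive one, which is why the
+     statistics above count lanes equal to -1. */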
+
+ RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersect8);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");
+ if (((size_t)rayhit) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(normal.travs,cnt,cnt,cnt);
+
+ IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+ RayHit8* rayhit8 = (RayHit8*) rayhit;
+ for (size_t i=0; i<8; i++) {
+ if (!valid[i]) continue;
+ RayHit ray1; rayhit8->get(i,ray1);
+ scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+ rayhit8->set(i,ray1);
+ }
+#else
+ if (likely(scene->intersectors.intersector8))
+ scene->intersectors.intersect8(valid,*rayhit,&context);
+ else
+ scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context);
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersect16);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");
+ if (((size_t)rayhit) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(normal.travs,cnt,cnt,cnt);
+
+ IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+ RayHit16* rayhit16 = (RayHit16*) rayhit;
+ for (size_t i=0; i<16; i++) {
+ if (!valid[i]) continue;
+ RayHit ray1; rayhit16->get(i,ray1);
+ scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+ rayhit16->set(i,ray1);
+ }
+#else
+ if (likely(scene->intersectors.intersector16))
+ scene->intersectors.intersect16(valid,*rayhit,&context);
+ else
+ scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context);
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersect1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+ STAT3(normal.travs,M,M,M);
+ IntersectContext context(scene,user_context);
+
+ /* fast codepath for single rays */
+ if (likely(M == 1)) {
+ if (likely(rayhit->ray.tnear <= rayhit->ray.tfar))
+ scene->intersectors.intersect(*rayhit,&context);
+ }
+
+ /* codepath for streams */
+ else {
+ scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context);
+ }
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
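+
+  /* Caller-side sketch (requires a build with EMBREE_RAY_PACKETS): tracing M
+     independent rays stored as an array-of-structures, where byteStride is the
+     distance between consecutive RTCRayHit elements:
+
+       std::vector<RTCRayHit> rays(M);  // initialize each element as for rtcIntersect1
+       rtcIntersect1M(scene, &context, rays.data(), (unsigned)M, sizeof(RTCRayHit));
+  */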
+
+ RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersect1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+ STAT3(normal.travs,M,M,M);
+ IntersectContext context(scene,user_context);
+
+ /* fast codepath for single rays */
+ if (likely(M == 1)) {
+ if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar))
+ scene->intersectors.intersect(*rn[0],&context);
+ }
+
+ /* codepath for streams */
+ else {
+ scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context);
+ }
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersectNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+ STAT3(normal.travs,N*M,N*M,N*M);
+ IntersectContext context(scene,user_context);
+
+ /* code path for single ray streams */
+ if (likely(N == 1))
+ {
+ /* fast code path for streams of size 1 */
+ if (likely(M == 1)) {
+ if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar))
+ scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context);
+ }
+ /* normal codepath for single ray streams */
+ else {
+ scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context);
+ }
+ }
+ /* code path for ray packet streams */
+ else {
+ scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context);
+ }
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcIntersectNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tfar not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes");
+ if (((size_t)rayhit->ray.mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.Ng_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.Ng_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.Ng_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.u ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.v ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes");
+ if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes");
+#endif
+ STAT3(normal.travs,N,N,N);
+ IntersectContext context(scene,user_context);
+ scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context);
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
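+
+  /* Sketch of the structure-of-pointers layout consumed above (hypothetical
+     caller code): every RTCRayHitNp member points at its own N-element array,
+     each 4-byte aligned as the DEBUG checks verify:
+
+       std::vector<float> org_x(N), org_y(N), org_z(N);  // one array per field
+       RTCRayHitNp rayhit = {};
+       rayhit.ray.org_x = org_x.data();
+       rayhit.ray.org_y = org_y.data();
+       rayhit.ray.org_z = org_z.data();
+       // ... remaining ray and hit fields analogously ...
+       rtcIntersectNp(scene, &context, &rayhit, N);
+  */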
+
+ RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccluded1);
+ STAT3(shadow.travs,1,1,1);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");
+#endif
+ IntersectContext context(scene,user_context);
+ scene->intersectors.occluded(*ray,&context);
+ RTC_CATCH_END2(scene);
+ }
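+
+  /* Occlusion convention (embree3): on a hit only ray->tfar is updated, being
+     set to -inf; otherwise the ray stays untouched. A caller therefore tests:
+
+       bool occluded = (ray.tfar == -std::numeric_limits<float>::infinity());
+  */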
+
+ RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccluded4);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");
+ if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(shadow.travs,cnt,cnt,cnt);
+
+ IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+ Ray4* ray4 = (Ray4*) ray;
+ for (size_t i=0; i<4; i++) {
+ if (!valid[i]) continue;
+ Ray ray1; ray4->get(i,ray1);
+ scene->intersectors.occluded((RTCRay&)ray1,&context);
+ ray4->set(i,ray1);
+ }
+#else
+ scene->intersectors.occluded4(valid,*ray,&context);
+#endif
+
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccluded8);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");
+ if (((size_t)ray) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(shadow.travs,cnt,cnt,cnt);
+
+ IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+ Ray8* ray8 = (Ray8*) ray;
+ for (size_t i=0; i<8; i++) {
+ if (!valid[i]) continue;
+ Ray ray1; ray8->get(i,ray1);
+ scene->intersectors.occluded((RTCRay&)ray1,&context);
+ ray8->set(i,ray1);
+ }
+#else
+ if (likely(scene->intersectors.intersector8))
+ scene->intersectors.occluded8(valid,*ray,&context);
+ else
+ scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context);
+#endif
+
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccluded16);
+
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");
+ if (((size_t)ray) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes");
+#endif
+ STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+ STAT3(shadow.travs,cnt,cnt,cnt);
+
+ IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+ Ray16* ray16 = (Ray16*) ray;
+ for (size_t i=0; i<16; i++) {
+ if (!valid[i]) continue;
+ Ray ray1; ray16->get(i,ray1);
+ scene->intersectors.occluded((RTCRay&)ray1,&context);
+ ray16->set(i,ray1);
+ }
+#else
+ if (likely(scene->intersectors.intersector16))
+ scene->intersectors.occluded16(valid,*ray,&context);
+ else
+ scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context);
+#endif
+
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccluded1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+ STAT3(shadow.travs,M,M,M);
+ IntersectContext context(scene,user_context);
+ /* fast codepath for streams of size 1 */
+ if (likely(M == 1)) {
+ if (likely(ray->tnear <= ray->tfar))
+ scene->intersectors.occluded (*ray,&context);
+ }
+ /* codepath for normal streams */
+ else {
+ scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context);
+ }
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccluded1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+ STAT3(shadow.travs,M,M,M);
+ IntersectContext context(scene,user_context);
+
+ /* fast codepath for streams of size 1 */
+ if (likely(M == 1)) {
+ if (likely(ray[0]->tnear <= ray[0]->tfar))
+ scene->intersectors.occluded (*ray[0],&context);
+ }
+ /* codepath for normal streams */
+ else {
+ scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context);
+ }
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccludedNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small");
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+ STAT3(shadow.travs,N*M,N*M,N*M);
+ IntersectContext context(scene,user_context);
+
+ /* codepath for single rays */
+ if (likely(N == 1))
+ {
+ /* fast path for streams of size 1 */
+ if (likely(M == 1)) {
+ if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar))
+ scene->intersectors.occluded (*(RTCRay*)ray,&context);
+ }
+ /* codepath for normal ray streams */
+ else {
+ scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context);
+ }
+ }
+ /* code path for ray packet streams */
+ else {
+ scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context);
+ }
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcOccludedNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+ if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes");
+ if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes");
+ if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes");
+ if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");
+ if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes");
+ if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes");
+ if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes");
+ if (((size_t)ray->tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tfar not aligned to 4 bytes");
+ if (((size_t)ray->time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes");
+ if (((size_t)ray->mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes");
+#endif
+ STAT3(shadow.travs,N,N,N);
+ IntersectContext context(scene,user_context);
+ scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context);
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported");
+#endif
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcRetainScene (RTCScene hscene)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcRetainScene);
+ RTC_VERIFY_HANDLE(hscene);
+ scene->refInc();
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcReleaseScene (RTCScene hscene)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcReleaseScene);
+ RTC_VERIFY_HANDLE(hscene);
+ scene->refDec();
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ Ref<Scene> scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryInstancedScene);
+ RTC_VERIFY_HANDLE(hgeometry);
+ RTC_VERIFY_HANDLE(hscene);
+ geometry->setInstancedScene(scene);
+ RTC_CATCH_END2(geometry);
+ }
+
+ AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
+ {
+ AffineSpace3fa space = one;
+ switch (format)
+ {
+ case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+ space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]),
+ Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]),
+ Vec3fa(xfm[ 2], xfm[ 6], xfm[10]),
+ Vec3fa(xfm[ 3], xfm[ 7], xfm[11]));
+ break;
+
+ case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+ space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]),
+ Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]),
+ Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]),
+ Vec3fa(xfm[ 9], xfm[10], xfm[11]));
+ break;
+
+ case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+ space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]),
+ Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]),
+ Vec3fa(xfm[ 8], xfm[ 9], xfm[10]),
+ Vec3fa(xfm[12], xfm[13], xfm[14]));
+ break;
+
+ default:
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+ break;
+ }
+ return space;
+ }
+
+ void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
+ {
+ switch (format)
+ {
+ case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+ xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vy.x; xfm[ 2] = space.l.vz.x; xfm[ 3] = space.p.x;
+ xfm[ 4] = space.l.vx.y; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vz.y; xfm[ 7] = space.p.y;
+ xfm[ 8] = space.l.vx.z; xfm[ 9] = space.l.vy.z; xfm[10] = space.l.vz.z; xfm[11] = space.p.z;
+ break;
+
+ case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+ xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z;
+ xfm[ 3] = space.l.vy.x; xfm[ 4] = space.l.vy.y; xfm[ 5] = space.l.vy.z;
+ xfm[ 6] = space.l.vz.x; xfm[ 7] = space.l.vz.y; xfm[ 8] = space.l.vz.z;
+ xfm[ 9] = space.p.x; xfm[10] = space.p.y; xfm[11] = space.p.z;
+ break;
+
+ case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+ xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; xfm[ 3] = 0.f;
+ xfm[ 4] = space.l.vy.x; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vy.z; xfm[ 7] = 0.f;
+ xfm[ 8] = space.l.vz.x; xfm[ 9] = space.l.vz.y; xfm[10] = space.l.vz.z; xfm[11] = 0.f;
+ xfm[12] = space.p.x; xfm[13] = space.p.y; xfm[14] = space.p.z; xfm[15] = 1.f;
+ break;
+
+ default:
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+ break;
+ }
+ }
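+
+  /* For each supported format, loadTransform and storeTransform are inverses;
+     an illustrative round trip reproduces the 12 input floats:
+
+       float in[12] = { /* 12 values */ }, out[12];
+       storeTransform(loadTransform(RTC_FORMAT_FLOAT3X4_ROW_MAJOR, in),
+                      RTC_FORMAT_FLOAT3X4_ROW_MAJOR, out);
+  */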
+
+ RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryTransform);
+ RTC_VERIFY_HANDLE(hgeometry);
+ RTC_VERIFY_HANDLE(xfm);
+ const AffineSpace3fa transform = loadTransform(format, (const float*)xfm);
+ geometry->setTransform(transform, timeStep);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryTransformQuaternion);
+ RTC_VERIFY_HANDLE(hgeometry);
+ RTC_VERIFY_HANDLE(qd);
+
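+  /* The decomposition is packed into a single AffineSpace3fx: scale on the
+     diagonal, skew in the upper triangle, the (normalized) quaternion in the
+     .w components, and the translation folded into the otherwise-unused
+     entries vx.y, vx.z and vy.z. */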
+ AffineSpace3fx transform;
+ transform.l.vx.x = qd->scale_x;
+ transform.l.vy.y = qd->scale_y;
+ transform.l.vz.z = qd->scale_z;
+ transform.l.vy.x = qd->skew_xy;
+ transform.l.vz.x = qd->skew_xz;
+ transform.l.vz.y = qd->skew_yz;
+ transform.l.vx.y = qd->translation_x;
+ transform.l.vx.z = qd->translation_y;
+ transform.l.vy.z = qd->translation_z;
+ transform.p.x = qd->shift_x;
+ transform.p.y = qd->shift_y;
+ transform.p.z = qd->shift_z;
+
+ // normalize quaternion
+ Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k);
+ q = normalize(q);
+ transform.l.vx.w = q.i;
+ transform.l.vy.w = q.j;
+ transform.l.vz.w = q.k;
+ transform.p.w = q.r;
+
+ geometry->setQuaternionDecomposition(transform, timeStep);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryTransform);
+ const AffineSpace3fa transform = geometry->getTransform(time);
+ storeTransform(transform, format, (float*)xfm);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+ {
+ IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i;
+ args->report(args,filter_args);
+ }
+
+ RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+ {
+ OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i;
+ args->report(args,filter_args);
+ }
+
+ RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
+ {
+ Device* device = (Device*) hdevice;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcNewGeometry);
+ RTC_VERIFY_HANDLE(hdevice);
+
+ switch (type)
+ {
+ case RTC_GEOMETRY_TYPE_TRIANGLE:
+ {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ createTriangleMeshTy createTriangleMesh = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh);
+ Geometry* geom = createTriangleMesh(device);
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_QUAD:
+ {
+#if defined(EMBREE_GEOMETRY_QUAD)
+ createQuadMeshTy createQuadMesh = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh);
+ Geometry* geom = createQuadMesh(device);
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+ case RTC_GEOMETRY_TYPE_DISC_POINT:
+ case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+ {
+#if defined(EMBREE_GEOMETRY_POINT)
+ createPointsTy createPoints = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints);
+
+ Geometry* geom;
+ switch(type) {
+ case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+ geom = createPoints(device, Geometry::GTY_SPHERE_POINT);
+ break;
+ case RTC_GEOMETRY_TYPE_DISC_POINT:
+ geom = createPoints(device, Geometry::GTY_DISC_POINT);
+ break;
+ case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+ geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT);
+ break;
+ default:
+ geom = nullptr;
+ break;
+ }
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE:
+ case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE:
+ case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE:
+
+ case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE:
+ case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE:
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE:
+
+ case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE:
+ case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE:
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE:
+
+ case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE:
+ case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE:
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE:
+
+ case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE:
+ case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE:
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE:
+ {
+#if defined(EMBREE_GEOMETRY_CURVE)
+ createLineSegmentsTy createLineSegments = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments);
+ createCurvesTy createCurves = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves);
+
+ Geometry* geom;
+ switch (type) {
+ case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break;
+ case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break;
+ case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break;
+ //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break;
+
+ case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break;
+ case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break;
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break;
+
+ case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break;
+ case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break;
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break;
+
+ case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break;
+ case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break;
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break;
+
+ case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break;
+ case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break;
+ case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break;
+ default: geom = nullptr; break;
+ }
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_SUBDIVISION:
+ {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+ createSubdivMeshTy createSubdivMesh = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh);
+ //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason?
+ Geometry* geom = createSubdivMesh(device);
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_USER:
+ {
+#if defined(EMBREE_GEOMETRY_USER)
+ createUserGeometryTy createUserGeometry = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry);
+ Geometry* geom = createUserGeometry(device);
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_INSTANCE:
+ {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ createInstanceTy createInstance = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance);
+ Geometry* geom = createInstance(device);
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported");
+#endif
+ }
+
+ case RTC_GEOMETRY_TYPE_GRID:
+ {
+#if defined(EMBREE_GEOMETRY_GRID)
+ createGridMeshTy createGridMesh = nullptr;
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh);
+ Geometry* geom = createGridMesh(device);
+ return (RTCGeometry) geom->refInc();
+#else
+ throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported");
+#endif
+ }
+
+ default:
+ throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type");
+ }
+
+ RTC_CATCH_END(device);
+ return nullptr;
+ }
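+
+  /* Caller-side sketch (public embree3 API; `device` and `scene` assumed):
+     creating and committing a one-triangle mesh:
+
+       RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_TRIANGLE);
+       float* v = (float*) rtcSetNewGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0,
+                             RTC_FORMAT_FLOAT3, 3*sizeof(float), 3);
+       unsigned* idx = (unsigned*) rtcSetNewGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0,
+                             RTC_FORMAT_UINT3, 3*sizeof(unsigned), 1);
+       // ... fill v[0..8] and idx[0..2] ...
+       rtcCommitGeometry(geom);
+       unsigned geomID = rtcAttachGeometry(scene, geom);
+       rtcReleaseGeometry(geom);  // scene now holds the reference
+  */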
+
+ RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryUserPrimitiveCount);
+ RTC_VERIFY_HANDLE(hgeometry);
+
+ if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY))
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries");
+
+ geometry->setNumPrimitives(userPrimitiveCount);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryTimeStepCount);
+ RTC_VERIFY_HANDLE(hgeometry);
+
+ if (timeStepCount > RTC_MAX_TIME_STEP_COUNT)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range");
+
+ geometry->setNumTimeSteps(timeStepCount);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime)
+ {
+ Ref<Geometry> geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryTimeRange);
+ RTC_VERIFY_HANDLE(hgeometry);
+
+ if (startTime > endTime)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime");
+
+ geometry->setTimeRange(BBox1f(startTime,endTime));
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryVertexAttributeCount);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setVertexAttributeCount(N);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryTopologyCount);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setTopologyCount(N);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryBuildQuality);
+ RTC_VERIFY_HANDLE(hgeometry);
+ if (quality != RTC_BUILD_QUALITY_LOW &&
+ quality != RTC_BUILD_QUALITY_MEDIUM &&
+ quality != RTC_BUILD_QUALITY_HIGH &&
+ quality != RTC_BUILD_QUALITY_REFIT)
+ // -- GODOT start --
+ // throw std::runtime_error("invalid build quality");
+ abort();
+ // -- GODOT end --
+ geometry->setBuildQuality(quality);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryMaxRadiusScale);
+ RTC_VERIFY_HANDLE(hgeometry);
+#if RTC_MIN_WIDTH
+ if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale must be larger than or equal to 1");
+ geometry->setMaxRadiusScale(maxRadiusScale);
+#else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled");
+#endif
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryMask);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setMask(mask);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometrySubdivisionMode);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setSubdivisionMode(topologyID,mode);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryVertexAttributeTopology);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setVertexAttributeTopology(vertexAttributeID, topologyID);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ Ref<Buffer> buffer = (Buffer*)hbuffer;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryBuffer);
+ RTC_VERIFY_HANDLE(hgeometry);
+ RTC_VERIFY_HANDLE(hbuffer);
+
+ if (geometry->device != buffer->device)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+
+ if (itemCount > 0xFFFFFFFFu)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+
+ geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetSharedGeometryBuffer);
+ RTC_VERIFY_HANDLE(hgeometry);
+
+ if (itemCount > 0xFFFFFFFFu)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+
+ Ref<Buffer> buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset);
+ geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetNewGeometryBuffer);
+ RTC_VERIFY_HANDLE(hgeometry);
+
+ if (itemCount > 0xFFFFFFFFu)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+
+ /* vertex buffers need to be overallocated slightly as elements are accessed using 16-byte SSE loads */
+ size_t bytes = itemCount*byteStride;
+ if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+ bytes += (16 - (byteStride%16))%16;
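+ // e.g. byteStride == 12 pads by (16 - 12%16)%16 == 4 bytes, so a 16-byte
+ // SSE load of the final element stays inside the allocation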
+
+ Ref<Buffer> buffer = new Buffer(geometry->device, bytes);
+ geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount);
+ return buffer->data();
+ RTC_CATCH_END2(geometry);
+ return nullptr;
+ }
+
+ RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryBufferData);
+ RTC_VERIFY_HANDLE(hgeometry);
+ return geometry->getBuffer(type, slot);
+ RTC_CATCH_END2(geometry);
+ return nullptr;
+ }
+
+ RTC_API void rtcEnableGeometry (RTCGeometry hgeometry)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcEnableGeometry);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->enable();
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcUpdateGeometryBuffer);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->updateBuffer(type, slot);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcDisableGeometry (RTCGeometry hgeometry)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcDisableGeometry);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->disable();
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryTessellationRate);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setTessellationRate(tessellationRate);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryUserData);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setUserData(ptr);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry)
+ {
+ Geometry* geometry = (Geometry*) hgeometry; // no ref counting here!
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryUserData);
+ RTC_VERIFY_HANDLE(hgeometry);
+ return geometry->getUserData();
+ RTC_CATCH_END2(geometry);
+ return nullptr;
+ }
+
+ RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryBoundsFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setBoundsFunction(bounds,userPtr);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryDisplacementFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setDisplacementFunction(displacement);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryIntersectFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setIntersectFunctionN(intersect);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryPointQueryFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setPointQueryFunction(pointQuery);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryFirstHalfEdge);
+ return geometry->getFirstHalfEdge(faceID);
+ RTC_CATCH_END2(geometry);
+ return -1;
+ }
+
+ RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryFace);
+ return geometry->getFace(edgeID);
+ RTC_CATCH_END2(geometry);
+ return -1;
+ }
+
+ RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryNextHalfEdge);
+ return geometry->getNextHalfEdge(edgeID);
+ RTC_CATCH_END2(geometry);
+ return -1;
+ }
+
+ RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryPreviousHalfEdge);
+ return geometry->getPreviousHalfEdge(edgeID);
+ RTC_CATCH_END2(geometry);
+ return -1;
+ }
+
+ RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometryOppositeHalfEdge);
+ return geometry->getOppositeHalfEdge(topologyID,edgeID);
+ RTC_CATCH_END2(geometry);
+ return -1;
+ }
+
+ RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryOccludedFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setOccludedFunctionN(occluded);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryIntersectFilterFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setIntersectionFilterFunctionN(filter);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcSetGeometryOccludedFilterFunction);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->setOcclusionFilterFunctionN(filter);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args)
+ {
+ Geometry* geometry = (Geometry*) args->geometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcInterpolate);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(args->geometry);
+#endif
+ geometry->interpolate(args);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args)
+ {
+ Geometry* geometry = (Geometry*) args->geometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcInterpolateN);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(args->geometry);
+#endif
+ geometry->interpolateN(args);
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcCommitGeometry (RTCGeometry hgeometry)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcCommitGeometry);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->commit();
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry)
+ {
+ Scene* scene = (Scene*) hscene;
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcAttachGeometry);
+ RTC_VERIFY_HANDLE(hscene);
+ RTC_VERIFY_HANDLE(hgeometry);
+ if (scene->device != geometry->device)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+ return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry);
+ RTC_CATCH_END2(scene);
+ return -1;
+ }
+
+ RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID)
+ {
+ Scene* scene = (Scene*) hscene;
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcAttachGeometryByID);
+ RTC_VERIFY_HANDLE(hscene);
+ RTC_VERIFY_HANDLE(hgeometry);
+ RTC_VERIFY_GEOMID(geomID);
+ if (scene->device != geometry->device)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+ scene->bind(geomID,geometry);
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcDetachGeometry);
+ RTC_VERIFY_HANDLE(hscene);
+ RTC_VERIFY_GEOMID(geomID);
+ scene->detachGeometry(geomID);
+ RTC_CATCH_END2(scene);
+ }
+
+ RTC_API void rtcRetainGeometry (RTCGeometry hgeometry)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcRetainGeometry);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->refInc();
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry)
+ {
+ Geometry* geometry = (Geometry*) hgeometry;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcReleaseGeometry);
+ RTC_VERIFY_HANDLE(hgeometry);
+ geometry->refDec();
+ RTC_CATCH_END2(geometry);
+ }
+
+ RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID)
+ {
+ Scene* scene = (Scene*) hscene;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcGetGeometry);
+#if defined(DEBUG)
+ RTC_VERIFY_HANDLE(hscene);
+ RTC_VERIFY_GEOMID(geomID);
+#endif
+ return (RTCGeometry) scene->get(geomID);
+ RTC_CATCH_END2(scene);
+ return nullptr;
+ }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h
new file mode 100644
index 0000000000..4b070e122b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h
@@ -0,0 +1,142 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../include/embree3/rtcore.h"
+RTC_NAMESPACE_USE
+
+namespace embree
+{
+ /*! decoding of intersection flags */
+ __forceinline bool isCoherent (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; }
+ __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8)
+# define USE_TASK_ARENA 1
+#else
+# define USE_TASK_ARENA 0
+#endif
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
+# define TASKING_TBB_USE_TASK_ISOLATION 1
+#else
+# define TASKING_TBB_USE_TASK_ISOLATION 0
+#endif
+
+/*! Macros used in the rtcore API implementation */
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
+
+// #define RTC_CATCH_END(device) \
+// } catch (std::bad_alloc&) { \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// } catch (rtcore_error& e) { \
+// Device::process_error(device,e.error,e.what()); \
+// } catch (std::exception& e) { \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// } catch (...) { \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// }
+#define RTC_CATCH_END(device)
+
+// #define RTC_CATCH_END2(scene) \
+// } catch (std::bad_alloc&) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// } catch (rtcore_error& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,e.error,e.what()); \
+// } catch (std::exception& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// } catch (...) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// }
+#define RTC_CATCH_END2(scene)
+
+// #define RTC_CATCH_END2_FALSE(scene) \
+// } catch (std::bad_alloc&) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \
+// return false; \
+// } catch (rtcore_error& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,e.error,e.what()); \
+// return false; \
+// } catch (std::exception& e) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \
+// return false; \
+// } catch (...) { \
+// Device* device = scene ? scene->device : nullptr; \
+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+// return false; \
+// }
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
+
+#define RTC_VERIFY_HANDLE(handle) \
+ if (handle == nullptr) { \
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+ }
+
+#define RTC_VERIFY_GEOMID(id) \
+ if (id == RTC_INVALID_GEOMETRY_ID) { \
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+ }
+
+#define RTC_VERIFY_UPPER(id,upper) \
+ if (id > upper) { \
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+ }
+
+#define RTC_VERIFY_RANGE(id,lower,upper) \
+ if (id < lower || id > upper) \
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds");
+
+#if 0 // enable to debug print all API calls
+#define RTC_TRACE(x) std::cout << #x << std::endl;
+#else
+#define RTC_TRACE(x)
+#endif
+
+// -- GODOT begin --
+// /*! used to throw embree API errors */
+// struct rtcore_error : public std::exception
+// {
+// __forceinline rtcore_error(RTCError error, const std::string& str)
+// : error(error), str(str) {}
+//
+// ~rtcore_error() throw() {}
+//
+// const char* what () const throw () {
+// return str.c_str();
+// }
+//
+// RTCError error;
+// std::string str;
+// };
+// -- GODOT end --
+
+#if defined(DEBUG) // only report file and line in debug mode
+ // -- GODOT begin --
+ // #define throw_RTCError(error,str) \
+ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+ #define throw_RTCError(error,str) \
+   printf("%s (%d): %s\n", __FILE__, __LINE__, std::string(str).c_str()), abort();
+ // -- GODOT end --
+#else
+ // -- GODOT begin --
+ // #define throw_RTCError(error,str) \
+ // throw rtcore_error(error,str);
+ #define throw_RTCError(error,str) \
+ abort();
+ // -- GODOT end --
+#endif
+
+#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+ (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member)))
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
new file mode 100644
index 0000000000..6bb96bba07
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "alloc.h"
+
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/bvh_builder_morton.h"
+
+namespace embree
+{
+ namespace isa // FIXME: support more ISAs for builders
+ {
+ struct BVH : public RefCount
+ {
+ BVH (Device* device)
+ : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0)
+ {
+ device->refInc();
+ }
+
+ ~BVH() {
+ device->refDec();
+ }
+
+ public:
+ Device* device;
+ FastAllocator allocator;
+ mvector<BVHBuilderMorton::BuildPrim> morton_src;
+ mvector<BVHBuilderMorton::BuildPrim> morton_tmp;
+ };
+
+ void* rtcBuildBVHMorton(const RTCBuildArguments* arguments)
+ {
+ BVH* bvh = (BVH*) arguments->bvh;
+ RTCBuildPrimitive* prims_i = arguments->primitives;
+ size_t primitiveCount = arguments->primitiveCount;
+ RTCCreateNodeFunction createNode = arguments->createNode;
+ RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+ RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+ RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+ RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+ void* userPtr = arguments->userPtr;
+
+ std::atomic<size_t> progress(0);
+
+ /* initialize temporary arrays for morton builder */
+ PrimRef* prims = (PrimRef*) prims_i;
+ mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src;
+ mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp;
+ morton_src.resize(primitiveCount);
+ morton_tmp.resize(primitiveCount);
+
+ /* compute centroid bounds */
+ const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa {
+
+ BBox3fa bounds(empty);
+ for (size_t i=r.begin(); i<r.end(); i++)
+ bounds.extend(prims[i].bounds().center2());
+ return bounds;
+ }, BBox3fa::merge);
+
+ /* compute morton codes */
+ BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+ parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) {
+ BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]);
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ generator(prims[i].bounds(),(unsigned) i);
+ }
+ });
+
+ /* start morton build */
+ std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>(
+
+ /* thread local allocator for fast allocations */
+ [&] () -> FastAllocator::CachedAllocator {
+ return bvh->allocator.getCachedAllocator();
+ },
+
+ /* lambda function that allocates BVH nodes */
+ [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* {
+ return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+ },
+
+ /* lambda function that sets bounds */
+ [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa>
+ {
+ BBox3fa bounds = empty;
+ void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+ const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+ for (size_t i=0; i<N; i++) {
+ bounds.extend(children[i].second);
+ childptrs[i] = children[i].first;
+ cbounds[i] = (const RTCBounds*)&children[i].second;
+ }
+ setNodeBounds(node,cbounds,(unsigned int)N,userPtr);
+ setNodeChildren(node,childptrs, (unsigned int)N,userPtr);
+ return std::make_pair(node,bounds);
+ },
+
+ /* lambda function that creates BVH leaves */
+ [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa>
+ {
+ RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF];
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<current.size(); i++)
+ {
+ const size_t id = morton_src[current.begin()+i].index;
+ bounds.extend(prims[id].bounds());
+ localBuildPrims[i] = prims_i[id];
+ }
+ void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr);
+ return std::make_pair(node,bounds);
+ },
+
+ /* lambda that calculates the bounds for some primitive */
+ [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa {
+ return prims[morton.index].bounds();
+ },
+
+ /* progress monitor function */
+ [&] (size_t dn) {
+ if (!buildProgress) return true;
+ const size_t n = progress.fetch_add(dn)+dn;
+ const double f = std::min(1.0,double(n)/double(primitiveCount));
+ return buildProgress(userPtr,f);
+ },
+
+ morton_src.data(),morton_tmp.data(),primitiveCount,
+ *arguments);
+
+ bvh->allocator.cleanup();
+ return root.first;
+ }
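+
+    /* These internal builders are reached through the public rtcBuildBVH entry
+       point; a minimal caller-side sketch (callback names are hypothetical):
+
+         RTCBuildArguments args = rtcDefaultBuildArguments();
+         args.bvh             = rtcNewBVH(device);
+         args.buildQuality    = RTC_BUILD_QUALITY_LOW;  // low quality selects the Morton builder
+         args.primitives      = prims;
+         args.primitiveCount  = count;
+         args.primitiveArrayCapacity = count;
+         args.createNode      = myCreateNode;
+         args.setNodeChildren = mySetNodeChildren;
+         args.setNodeBounds   = mySetNodeBounds;
+         args.createLeaf      = myCreateLeaf;
+         void* root = rtcBuildBVH(&args);
+    */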
+
+ void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments)
+ {
+ BVH* bvh = (BVH*) arguments->bvh;
+ RTCBuildPrimitive* prims = arguments->primitives;
+ size_t primitiveCount = arguments->primitiveCount;
+ RTCCreateNodeFunction createNode = arguments->createNode;
+ RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+ RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+ RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+ RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+ void* userPtr = arguments->userPtr;
+
+ std::atomic<size_t> progress(0);
+
+ /* calculate priminfo */
+ auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+ {
+ CentGeomBBox3fa bounds(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ bounds.extend((BBox3fa&)prims[j]);
+ return bounds;
+ };
+ const CentGeomBBox3fa bounds =
+ parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+ const PrimInfo pinfo(0,primitiveCount,bounds);
+
+ /* build BVH */
+ void* root = BVHBuilderBinnedSAH::build<void*>(
+
+ /* thread local allocator for fast allocations */
+ [&] () -> FastAllocator::CachedAllocator {
+ return bvh->allocator.getCachedAllocator();
+ },
+
+ /* lambda function that creates BVH nodes */
+ [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+ {
+ void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+ const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+ for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+ setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+ return node;
+ },
+
+ /* lambda function that updates BVH nodes */
+ [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+ setNodeChildren(node,children, (unsigned int)N,userPtr);
+ return node;
+ },
+
+ /* lambda function that creates BVH leaves */
+ [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+ return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+ },
+
+ /* progress monitor function */
+ [&] (size_t dn) {
+ if (!buildProgress) return true;
+ const size_t n = progress.fetch_add(dn)+dn;
+ const double f = std::min(1.0,double(n)/double(primitiveCount));
+ return buildProgress(userPtr,f);
+ },
+
+ (PrimRef*)prims,pinfo,*arguments);
+
+ bvh->allocator.cleanup();
+ return root;
+ }
+
+ static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) {
+ CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first);
+ unsigned int maxGeomID = max(a.second,b.second);
+ return std::pair<CentGeomBBox3fa,unsigned int>(centBounds,maxGeomID);
+ }
+
+ void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments)
+ {
+ BVH* bvh = (BVH*) arguments->bvh;
+ RTCBuildPrimitive* prims = arguments->primitives;
+ size_t primitiveCount = arguments->primitiveCount;
+ RTCCreateNodeFunction createNode = arguments->createNode;
+ RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+ RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+ RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+ RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive;
+ RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+ void* userPtr = arguments->userPtr;
+
+ std::atomic<size_t> progress(0);
+
+ /* calculate priminfo */
+
+ auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int>
+ {
+ CentGeomBBox3fa bounds(empty);
+ unsigned maxGeomID = 0;
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ bounds.extend((BBox3fa&)prims[j]);
+ maxGeomID = max(maxGeomID,prims[j].geomID);
+ }
+ return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID);
+ };
+
+
+ const std::pair<CentGeomBBox3fa,unsigned int> pair =
+ parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair);
+
+ CentGeomBBox3fa bounds = pair.first;
+ const unsigned int maxGeomID = pair.second;
+
+ if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))))
+ {
+ /* fallback code for max geomID larger than threshold */
+ return rtcBuildBVHBinnedSAH(arguments);
+ }
+
+ const PrimInfo pinfo(0,primitiveCount,bounds);
+
+ /* function that splits a build primitive */
+ struct Splitter
+ {
+ Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr)
+ : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {}
+
+ __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const
+ {
+ prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK;
+ splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+ left_o.geomIDref() = geomID; left_o.primIDref() = primID;
+ right_o.geomIDref() = geomID; right_o.primIDref() = primID;
+ }
+
+ __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const
+ {
+ PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID);
+ splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+ }
+
+ RTCSplitPrimitiveFunction splitPrimitive;
+ unsigned geomID;
+ unsigned primID;
+ void* userPtr;
+ };
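+ /* A user-supplied splitPrimitive callback clips one build primitive against
+    an axis-aligned plane. A minimal sketch of such a callback (hedged: the
+    clipping itself is application-specific, clipBounds is hypothetical):
+
+      void mySplit(const RTCBuildPrimitive* prim, unsigned dim, float pos,
+                   RTCBounds* lbounds, RTCBounds* rbounds, void* userPtr)
+      {
+        clipBounds(*prim, dim, pos, lbounds, rbounds); // fill both halves
+      }
+
+    The Splitter above wraps this callback so the spatial-split binner can
+    split both full PrimRefs and plain bounding boxes. */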
+
+ /* build BVH */
+ void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>(
+
+ /* thread local allocator for fast allocations */
+ [&] () -> FastAllocator::CachedAllocator {
+ return bvh->allocator.getCachedAllocator();
+ },
+
+ /* lambda function that creates BVH nodes */
+ [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+ {
+ void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+ const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+ for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+ setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+ return node;
+ },
+
+ /* lambda function that updates BVH nodes */
+ [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+ setNodeChildren(node,children, (unsigned int)N,userPtr);
+ return node;
+ },
+
+ /* lambda function that creates BVH leaves */
+ [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+ return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+ },
+
+ /* returns the splitter */
+ [&] ( const PrimRef& prim ) -> Splitter {
+ return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr);
+ },
+
+ /* progress monitor function */
+ [&] (size_t dn) {
+ if (!buildProgress) return true;
+ const size_t n = progress.fetch_add(dn)+dn;
+ const double f = std::min(1.0,double(n)/double(primitiveCount));
+ return buildProgress(userPtr,f);
+ },
+
+ (PrimRef*)prims,
+ arguments->primitiveArrayCapacity,
+ pinfo,*arguments);
+
+ bvh->allocator.cleanup();
+ return root;
+ }
+ }
+}
+
+using namespace embree;
+using namespace embree::isa;
+
+RTC_NAMESPACE_BEGIN
+
+ RTC_API RTCBVH rtcNewBVH(RTCDevice device)
+ {
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcNewBVH);
+ RTC_VERIFY_HANDLE(device);
+ BVH* bvh = new BVH((Device*)device);
+ return (RTCBVH) bvh->refInc();
+ RTC_CATCH_END((Device*)device);
+ return nullptr;
+ }
+
+ RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments)
+ {
+ BVH* bvh = (BVH*) arguments->bvh;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcBuildBVH);
+ RTC_VERIFY_HANDLE(bvh);
+ RTC_VERIFY_HANDLE(arguments);
+ RTC_VERIFY_HANDLE(arguments->createNode);
+ RTC_VERIFY_HANDLE(arguments->setNodeChildren);
+ RTC_VERIFY_HANDLE(arguments->setNodeBounds);
+ RTC_VERIFY_HANDLE(arguments->createLeaf);
+
+ if (arguments->primitiveArrayCapacity < arguments->primitiveCount)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount")
+
+ /* initialize the allocator */
+ bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
+ bvh->allocator.reset();
+
+ /* switch between different builders based on quality level */
+ if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
+ return rtcBuildBVHMorton(arguments);
+ else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
+ return rtcBuildBVHBinnedSAH(arguments);
+ else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) {
+ if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount)
+ return rtcBuildBVHBinnedSAH(arguments);
+ else
+ return rtcBuildBVHSpatialSAH(arguments);
+ }
+ else
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality");
+
+ /* if we are in dynamic mode, then do not clear temporary data */
+ if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC))
+ {
+ bvh->morton_src.clear();
+ bvh->morton_tmp.clear();
+ }
+
+ RTC_CATCH_END(bvh->device);
+ return nullptr;
+ }
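+ /* A minimal sketch of driving this entry point from user code (illustrative
+    only; the callback names and userData are hypothetical, while the rtc*
+    calls are the public builder API defined in this file):
+
+      RTCBVH bvh = rtcNewBVH(device);
+      RTCBuildArguments args = rtcDefaultBuildArguments();
+      args.bvh = bvh;
+      args.buildQuality = RTC_BUILD_QUALITY_MEDIUM;  // dispatches to rtcBuildBVHBinnedSAH
+      args.primitives = prims;                       // RTCBuildPrimitive array
+      args.primitiveCount = numPrims;
+      args.primitiveArrayCapacity = numPrims;        // must be >= primitiveCount (checked above)
+      args.createNode = myCreateNode;
+      args.setNodeChildren = mySetNodeChildren;
+      args.setNodeBounds = mySetNodeBounds;
+      args.createLeaf = myCreateLeaf;
+      args.userPtr = &userData;
+      void* root = rtcBuildBVH(&args);
+      // ... keep bvh retained while the returned nodes are in use ...
+      rtcReleaseBVH(bvh);
+  */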
+
+ RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align)
+ {
+ FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcThreadLocalAlloc);
+ return alloc->malloc0(bytes,align);
+ RTC_CATCH_END(alloc->alloc->getDevice());
+ return nullptr;
+ }
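+ /* Node memory in user callbacks comes from this allocator; a minimal sketch
+    of a createNode callback using it (Node is a hypothetical user type):
+
+      void* myCreateNode(RTCThreadLocalAllocator alloc, unsigned numChildren, void* userPtr)
+      {
+        void* mem = rtcThreadLocalAlloc(alloc, sizeof(Node), 16);
+        return new (mem) Node(numChildren); // placement-new into builder memory
+      }
+  */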
+
+ RTC_API void rtcMakeStaticBVH(RTCBVH hbvh)
+ {
+ BVH* bvh = (BVH*) hbvh;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcMakeStaticBVH);
+ RTC_VERIFY_HANDLE(hbvh);
+ bvh->morton_src.clear();
+ bvh->morton_tmp.clear();
+ RTC_CATCH_END(bvh->device);
+ }
+
+ RTC_API void rtcRetainBVH(RTCBVH hbvh)
+ {
+ BVH* bvh = (BVH*) hbvh;
+ Device* device = bvh ? bvh->device : nullptr;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcRetainBVH);
+ RTC_VERIFY_HANDLE(hbvh);
+ bvh->refInc();
+ RTC_CATCH_END(device);
+ }
+
+ RTC_API void rtcReleaseBVH(RTCBVH hbvh)
+ {
+ BVH* bvh = (BVH*) hbvh;
+ Device* device = bvh ? bvh->device : nullptr;
+ RTC_CATCH_BEGIN;
+ RTC_TRACE(rtcReleaseBVH);
+ RTC_VERIFY_HANDLE(hbvh);
+ bvh->refDec();
+ RTC_CATCH_END(device);
+ }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp
new file mode 100644
index 0000000000..1e23aeb415
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp
@@ -0,0 +1,976 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+ /* error-raising stubs for rtcIntersect/rtcOccluded when the scene is not committed or a variant is not enabled */
+ void missing_rtcCommit() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); }
+ void invalid_rtcIntersect1() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); }
+ void invalid_rtcIntersect4() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); }
+ void invalid_rtcIntersect8() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); }
+ void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); }
+ void invalid_rtcIntersectN() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); }
+
+ Scene::Scene (Device* device)
+ : device(device),
+ flags_modified(true), enabled_geometry_types(0),
+ scene_flags(RTC_SCENE_FLAG_NONE),
+ quality_flags(RTC_BUILD_QUALITY_MEDIUM),
+ is_build(false), modified(true),
+ progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0)
+ {
+ device->refInc();
+
+ intersectors = Accel::Intersectors(missing_rtcCommit);
+
+ /* flags can be overridden through the device for debugging */
+ if (device->quality_flags != -1)
+ quality_flags = (RTCBuildQuality) device->quality_flags;
+ if (device->scene_flags != -1)
+ scene_flags = (RTCSceneFlags) device->scene_flags;
+ }
+
+ Scene::~Scene() noexcept
+ {
+ device->refDec();
+ }
+
+ void Scene::printStatistics()
+ {
+ /* calculate the maximum number of time steps over all geometries */
+ unsigned max_time_steps = 0;
+ for (size_t i=0; i<size(); i++) {
+ if (!get(i)) continue;
+ max_time_steps = max(max_time_steps,get(i)->numTimeSteps);
+ }
+
+ /* initialize vectors */
+ std::vector<size_t> statistics[Geometry::GTY_END];
+ for (size_t i=0; i<Geometry::GTY_END; i++)
+ statistics[i].resize(max_time_steps);
+
+ /* gather statistics */
+ for (size_t i=0; i<size(); i++)
+ {
+ if (!get(i)) continue;
+ int ty = get(i)->getType();
+ assert(ty<Geometry::GTY_END);
+ int timesegments = get(i)->numTimeSegments();
+ assert((unsigned int)timesegments < max_time_steps);
+ statistics[ty][timesegments] += get(i)->size();
+ }
+
+ /* print statistics */
+ std::cout << std::setw(23) << "segments" << ": ";
+ for (size_t t=0; t<max_time_steps; t++)
+ std::cout << std::setw(10) << t;
+ std::cout << std::endl;
+
+ std::cout << "-------------------------";
+ for (size_t t=0; t<max_time_steps; t++)
+ std::cout << "----------";
+ std::cout << std::endl;
+
+ for (size_t p=0; p<Geometry::GTY_END; p++)
+ {
+ if (std::string(Geometry::gtype_names[p]) == "") continue;
+ std::cout << std::setw(23) << Geometry::gtype_names[p] << ": ";
+ for (size_t t=0; t<max_time_steps; t++)
+ std::cout << std::setw(10) << statistics[p][t];
+ std::cout << std::endl;
+ }
+ }
+
+ void Scene::createTriangleAccel()
+ {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ if (device->tri_accel == "default")
+ {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW)
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
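+ /* mode packs the scene flags into two bits: bit 1 = compact, bit 0 = robust,
+    giving 0b00 fast, 0b01 robust, 0b10 compact, 0b11 compact+robust; the same
+    encoding is used by all accel-selection switches below */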
+ switch (mode) {
+ case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ {
+ if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+ accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+ else
+ accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+ }
+ else
+#endif
+ {
+ if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+ accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+ else
+ accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+ }
+ break;
+
+ case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+
+ break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else /* dynamic */
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else
+#endif
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ }
+ }
+ else if (device->tri_accel == "bvh4.triangle4") accels_add(device->bvh4_factory->BVH4Triangle4 (this));
+ else if (device->tri_accel == "bvh4.triangle4v") accels_add(device->bvh4_factory->BVH4Triangle4v(this));
+ else if (device->tri_accel == "bvh4.triangle4i") accels_add(device->bvh4_factory->BVH4Triangle4i(this));
+ else if (device->tri_accel == "qbvh4.triangle4i") accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->tri_accel == "bvh8.triangle4") accels_add(device->bvh8_factory->BVH8Triangle4 (this));
+ else if (device->tri_accel == "bvh8.triangle4v") accels_add(device->bvh8_factory->BVH8Triangle4v(this));
+ else if (device->tri_accel == "bvh8.triangle4i") accels_add(device->bvh8_factory->BVH8Triangle4i(this));
+ else if (device->tri_accel == "qbvh8.triangle4i") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this));
+ else if (device->tri_accel == "qbvh8.triangle4") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel);
+#endif
+ }
+
+ void Scene::createTriangleMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ if (device->tri_accel_mb == "default")
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX2()) // BVH8 reduces performance on AVX-only machines
+ {
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else
+#endif
+ {
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ }
+ else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this));
+ else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this));
+ else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb);
+#endif
+ }
+
+ void Scene::createQuadAccel()
+ {
+#if defined(EMBREE_GEOMETRY_QUAD)
+ if (device->quad_accel == "default")
+ {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW)
+ {
+ /* static */
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+ switch (mode) {
+ case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ {
+ if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+ accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+ else
+ accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+ }
+ else
+#endif
+ {
+ if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+ accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+ else
+ accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+ }
+ break;
+
+ case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+ break;
+
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else /* dynamic */
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else
+#endif
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ }
+ }
+ else if (device->quad_accel == "bvh4.quad4v") accels_add(device->bvh4_factory->BVH4Quad4v(this));
+ else if (device->quad_accel == "bvh4.quad4i") accels_add(device->bvh4_factory->BVH4Quad4i(this));
+ else if (device->quad_accel == "qbvh4.quad4i") accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->quad_accel == "bvh8.quad4v") accels_add(device->bvh8_factory->BVH8Quad4v(this));
+ else if (device->quad_accel == "bvh8.quad4i") accels_add(device->bvh8_factory->BVH8Quad4i(this));
+ else if (device->quad_accel == "qbvh8.quad4i") accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel);
+#endif
+ }
+
+ void Scene::createQuadMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_QUAD)
+ if (device->quad_accel_mb == "default")
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+ switch (mode) {
+ case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+ break;
+
+ case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX())
+ accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+ break;
+
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb);
+#endif
+ }
+
+ void Scene::createHairAccel()
+ {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+ if (device->hair_accel == "default")
+ {
+ int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel();
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX2()) // only enable on HSW machines; on SNB this codepath is slower
+ {
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ else
+#endif
+ {
+ switch (mode) {
+ case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+ case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break;
+ case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+ }
+ }
+ }
+ else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST));
+ else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST));
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST));
+ else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel);
+#endif
+ }
+
+ void Scene::createHairMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+ if (device->hair_accel_mb == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX2()) // only enable on HSW machines; on SNB this codepath is slower
+ {
+ if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST));
+ else accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+ }
+ else
+#endif
+ {
+ if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST));
+ else accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+ }
+ }
+ else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+ else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb);
+#endif
+ }
+
+ void Scene::createSubdivAccel()
+ {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+ if (device->subdiv_accel == "default") {
+ accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+ }
+ else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+ else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel);
+#endif
+ }
+
+ void Scene::createSubdivMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+ if (device->subdiv_accel_mb == "default") {
+ accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this));
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb);
+#endif
+ }
+
+ void Scene::createUserGeometryAccel()
+ {
+#if defined(EMBREE_GEOMETRY_USER)
+ if (device->object_accel == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel())
+ {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+ accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC));
+ } else {
+ accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC));
+ }
+ }
+ else
+#endif
+ {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+ accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC));
+ } else {
+ accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC));
+ }
+ }
+ }
+ else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this));
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel);
+#endif
+ }
+
+ void Scene::createUserGeometryMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_USER)
+ if (device->object_accel_mb == "default" ) {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel())
+ accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+ }
+ else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb);
+#endif
+ }
+
+ void Scene::createInstanceAccel()
+ {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ // if (device->object_accel == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel()) {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+ accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC));
+ } else {
+ accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC));
+ }
+ }
+ else
+#endif
+ {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+ accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC));
+ } else {
+ accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC));
+ }
+ }
+ }
+ // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+ }
+
+ void Scene::createInstanceMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ //if (device->instance_accel_mb == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel())
+ accels_add(device->bvh8_factory->BVH8InstanceMB(this, false));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4InstanceMB(this, false));
+ }
+ //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+ }
+
+ void Scene::createInstanceExpensiveAccel()
+ {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ // if (device->object_accel == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel()) {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+ accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC));
+ } else {
+ accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC));
+ }
+ }
+ else
+#endif
+ {
+ if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+ accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC));
+ } else {
+ accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC));
+ }
+ }
+ }
+ // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif
+ }
+
+ void Scene::createInstanceExpensiveMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ //if (device->instance_accel_mb == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel())
+ accels_add(device->bvh8_factory->BVH8InstanceMB(this, true));
+ else
+#endif
+ accels_add(device->bvh4_factory->BVH4InstanceMB(this, true));
+ }
+ //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+ }
+
+ void Scene::createGridAccel()
+ {
+ BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+#if defined(EMBREE_GEOMETRY_GRID)
+ if (device->grid_accel == "default")
+ {
+#if defined (EMBREE_TARGET_SIMD8)
+ if (device->canUseAVX() && !isCompactAccel())
+ {
+ accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+ }
+ else
+#endif
+ {
+ accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+ }
+ }
+ else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#if defined (EMBREE_TARGET_SIMD8)
+ else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#endif
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel);
+#endif
+
+ }
+
+ void Scene::createGridMBAccel()
+ {
+#if defined(EMBREE_GEOMETRY_GRID)
+ if (device->grid_accel_mb == "default")
+ {
+ accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
+ }
+ else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this));
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel);
+#endif
+
+ }
+
+ void Scene::clear() {
+ }
+
+ unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry)
+ {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(geometriesMutex);
+#else
+ Lock<SpinLock> lock(geometriesMutex);
+#endif
+ if (geomID == RTC_INVALID_GEOMETRY_ID) {
+ geomID = id_pool.allocate();
+ if (geomID == RTC_INVALID_GEOMETRY_ID)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene");
+ }
+ else
+ {
+ if (!id_pool.add(geomID))
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided");
+ }
+ if (geomID >= geometries.size()) {
+ geometries.resize(geomID+1);
+ vertices.resize(geomID+1);
+ geometryModCounters_.resize(geomID+1);
+ }
+ geometries[geomID] = geometry;
+ geometryModCounters_[geomID] = 0;
+ if (geometry->isEnabled()) {
+ setModified ();
+ }
+ return geomID;
+ }
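+ /* bind() backs both public attach paths; a hedged sketch of the mapping
+    (scene and geom are hypothetical handles):
+
+      unsigned id = rtcAttachGeometry(scene, geom); // passes RTC_INVALID_GEOMETRY_ID,
+                                                    // so a fresh id comes from id_pool
+      rtcAttachGeometryByID(scene, geom, 42);       // caller-chosen id, via id_pool.add(42)
+  */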
+
+ void Scene::detachGeometry(size_t geomID)
+ {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+ std::scoped_lock lock(geometriesMutex);
+#else
+ Lock<SpinLock> lock(geometriesMutex);
+#endif
+
+ if (geomID >= geometries.size())
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
+
+ Ref<Geometry>& geometry = geometries[geomID];
+ if (geometry == null)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
+
+ if (geometry->isEnabled()) {
+ setModified ();
+ }
+ accels_deleteGeometry(unsigned(geomID));
+ id_pool.deallocate((unsigned)geomID);
+ geometries[geomID] = null;
+ vertices[geomID] = nullptr;
+ geometryModCounters_[geomID] = 0;
+ }
+
+ void Scene::updateInterface()
+ {
+ is_build = true;
+ }
+
+ void Scene::commit_task ()
+ {
+ checkIfModifiedAndSet ();
+ if (!isModified()) {
+ return;
+ }
+
+ /* print scene statistics */
+ if (device->verbosity(2))
+ printStatistics();
+
+ progress_monitor_counter = 0;
+
+ /* gather scene stats and call preCommit function of each geometry */
+ this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (),
+ [this](const range<size_t>& r)->GeometryCounts
+ {
+ GeometryCounts c;
+ for (auto i=r.begin(); i<r.end(); ++i)
+ {
+ if (geometries[i] && geometries[i]->isEnabled())
+ {
+ geometries[i]->preCommit();
+ geometries[i]->addElementsToCount (c);
+ c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions();
+ }
+ }
+ return c;
+ },
+ std::plus<GeometryCounts>()
+ );
+
+ /* select acceleration structures to build */
+ unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
+ if (flags_modified || new_enabled_geometry_types != enabled_geometry_types)
+ {
+ accels_init();
+
+ /* we need to mark all geometries as modified, otherwise the two-level
+ builder will not rebuild geometries that are currently unmodified */
+ parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
+ geometryModCounters_[i] = 0;
+ });
+
+ if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
+ if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
+ if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
+ if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel();
+ if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel();
+ if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel();
+ if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel();
+ if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel();
+ if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel();
+ if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel();
+ if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel();
+ if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel();
+ if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel();
+ if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
+ if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
+ if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
+
+ flags_modified = false;
+ enabled_geometry_types = new_enabled_geometry_types;
+ }
+
+ /* select fast code path if no filter function is present */
+ accels_select(hasFilterFunction());
+
+ /* build all hierarchies of this scene */
+ accels_build();
+
+ /* make static geometry immutable */
+ if (!isDynamicAccel()) {
+ accels_immutable();
+ flags_modified = true; // in non-dynamic mode we have to re-create accels
+ }
+
+ /* call postCommit function of each geometry */
+ parallel_for(geometries.size(), [&] ( const size_t i ) {
+ if (geometries[i] && geometries[i]->isEnabled()) {
+ geometries[i]->postCommit();
+ vertices[i] = geometries[i]->getCompactVertexArray();
+ geometryModCounters_[i] = geometries[i]->getModCounter();
+ }
+ });
+
+ updateInterface();
+
+ if (device->verbosity(2)) {
+ std::cout << "created scene intersector" << std::endl;
+ accels_print(2);
+ std::cout << "selected scene intersector" << std::endl;
+ intersectors.print(2);
+ }
+
+ setModified(false);
+ }
+
+ void Scene::setBuildQuality(RTCBuildQuality quality_flags_i)
+ {
+ if (quality_flags == quality_flags_i) return;
+ quality_flags = quality_flags_i;
+ flags_modified = true;
+ }
+
+ RTCBuildQuality Scene::getBuildQuality() const {
+ return quality_flags;
+ }
+
+ void Scene::setSceneFlags(RTCSceneFlags scene_flags_i)
+ {
+ if (scene_flags == scene_flags_i) return;
+ scene_flags = scene_flags_i;
+ flags_modified = true;
+ }
+
+ RTCSceneFlags Scene::getSceneFlags() const {
+ return scene_flags;
+ }
+
+#if defined(TASKING_INTERNAL)
+
+ void Scene::commit (bool join)
+ {
+ Lock<MutexSys> buildLock(buildMutex,false);
+
+ /* allocate our own task scheduler for each build */
+ Ref<TaskScheduler> scheduler = nullptr;
+ {
+ Lock<MutexSys> lock(schedulerMutex);
+ scheduler = this->scheduler;
+ if (scheduler == null) {
+ buildLock.lock();
+ this->scheduler = scheduler = new TaskScheduler;
+ }
+ }
+
+ /* worker threads join build */
+ if (!buildLock.isLocked())
+ {
+ if (!join)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation");
+
+ scheduler->join();
+ return;
+ }
+
+ /* initiate build */
+ // -- GODOT start --
+ // try {
+ scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+ // }
+ // catch (...) {
+ // accels_clear();
+ // updateInterface();
+ // Lock<MutexSys> lock(schedulerMutex);
+ // this->scheduler = nullptr;
+ // throw;
+ // }
+ // -- GODOT end --
+ }
+
+#endif
+
+#if defined(TASKING_TBB) || defined(TASKING_GCD)
+
+ void Scene::commit (bool join)
+ {
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+ if (join)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version");
+#endif
+
+ /* try to obtain build lock */
+ Lock<MutexSys> lock(buildMutex,buildMutex.try_lock());
+
+ /* join hierarchy build */
+ if (!lock.isLocked())
+ {
+#if !TASKING_TBB_USE_TASK_ISOLATION
+ if (!join)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version");
+#endif
+
+ do {
+
+#if defined(TASKING_GCD)
+ // Do Nothing
+#else
+#if USE_TASK_ARENA
+ if (join) {
+ device->arena->execute([&]{ group.wait(); });
+ }
+ else
+#endif
+ {
+ group.wait();
+ }
+#endif
+
+ pause_cpu();
+ yield();
+
+ } while (!buildMutex.try_lock());
+
+ buildMutex.unlock();
+ return;
+ }
+
+ /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+ const unsigned int mxcsr = _mm_getcsr();
+ _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
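+ /* bit 15 is FTZ (flush denormal results to zero) and bit 6 is DAZ (treat
+    denormal inputs as zero); both avoid the heavy microcode penalty of
+    denormal arithmetic during the build */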
+
+ try {
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION_MAJOR < 8
+ tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
+#else
+ tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
+#endif
+ //ctx.set_priority(tbb::priority_high);
+
+#if USE_TASK_ARENA
+ if (join)
+ {
+ device->arena->execute([&]{
+ group.run([&]{
+ tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+ });
+ group.wait();
+ });
+ }
+ else
+#endif
+ {
+ group.run([&]{
+ tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+ });
+ group.wait();
+ }
+
+ /* reset MXCSR register again */
+ _mm_setcsr(mxcsr);
+
+#elif defined(TASKING_GCD)
+
+ commit_task();
+
+#endif // #if defined(TASKING_TBB)
+
+ }
+ catch (...)
+ {
+ /* reset MXCSR register again */
+ _mm_setcsr(mxcsr);
+
+ accels_clear();
+ updateInterface();
+ throw;
+ }
+ }
+#endif
+
+#if defined(TASKING_PPL)
+
+ void Scene::commit (bool join)
+ {
+#if defined(TASKING_PPL)
+ if (join)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL");
+#endif
+
+ /* try to obtain build lock */
+ Lock<MutexSys> lock(buildMutex);
+
+ checkIfModifiedAndSet ();
+ if (!isModified()) {
+ return;
+ }
+
+ /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+ const unsigned int mxcsr = _mm_getcsr();
+ _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+
+ try {
+
+ group.run([&]{
+ concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); });
+ });
+ group.wait();
+
+ /* reset MXCSR register again */
+ _mm_setcsr(mxcsr);
+ }
+ catch (...)
+ {
+ /* reset MXCSR register again */
+ _mm_setcsr(mxcsr);
+
+ accels_clear();
+ updateInterface();
+ throw;
+ }
+ }
+#endif
+
+ void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr)
+ {
+ progress_monitor_function = func;
+ progress_monitor_ptr = ptr;
+ }
+
+ void Scene::progressMonitor(double dn)
+ {
+ if (progress_monitor_function) {
+ size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn));
+ if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) {
+ throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination");
+ }
+ }
+ }
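+ /* A hedged usage sketch of the public side of this mechanism: the monitor is
+    installed per scene and cancels the build by returning false (myMonitor and
+    BuildState are hypothetical):
+
+      bool myMonitor(void* ptr, double n) { // n advances toward 1.0
+        return !((BuildState*)ptr)->cancelRequested;
+      }
+      rtcSetSceneProgressMonitorFunction(scene, myMonitor, &state);
+  */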
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h
new file mode 100644
index 0000000000..b41c6cde91
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.h
@@ -0,0 +1,390 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "builder.h"
+#include "../../common/algorithms/parallel_any_of.h"
+#include "scene_triangle_mesh.h"
+#include "scene_quad_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_line_segments.h"
+#include "scene_subdiv_mesh.h"
+#include "scene_grid_mesh.h"
+#include "scene_points.h"
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+namespace embree
+{
+ /*! Base class all scenes are derived from */
+ class Scene : public AccelN
+ {
+ ALIGNED_CLASS_(std::alignment_of<Scene>::value);
+
+ public:
+ template<typename Ty, bool mblur = false>
+ class Iterator
+ {
+ public:
+ Iterator () {}
+
+ Iterator (Scene* scene, bool all = false)
+ : scene(scene), all(all) {}
+
+ __forceinline Ty* at(const size_t i)
+ {
+ Geometry* geom = scene->geometries[i].ptr;
+ if (geom == nullptr) return nullptr;
+ if (!all && !geom->isEnabled()) return nullptr;
+ const size_t mask = geom->getTypeMask() & Ty::geom_type;
+ if (!(mask)) return nullptr;
+ if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+ return (Ty*) geom;
+ }
+
+ __forceinline Ty* operator[] (const size_t i) {
+ return at(i);
+ }
+
+ __forceinline size_t size() const {
+ return scene->size();
+ }
+
+ __forceinline size_t numPrimitives() const {
+ return scene->getNumPrimitives(Ty::geom_type,mblur);
+ }
+
+ __forceinline size_t maxPrimitivesPerGeometry()
+ {
+ size_t ret = 0;
+ for (size_t i=0; i<scene->size(); i++) {
+ Ty* mesh = at(i);
+ if (mesh == nullptr) continue;
+ ret = max(ret,mesh->size());
+ }
+ return ret;
+ }
+
+ __forceinline unsigned int maxGeomID()
+ {
+ unsigned int ret = 0;
+ for (size_t i=0; i<scene->size(); i++) {
+ Ty* mesh = at(i);
+ if (mesh == nullptr) continue;
+ ret = max(ret,(unsigned int)i);
+ }
+ return ret;
+ }
+
+ __forceinline unsigned maxTimeStepsPerGeometry()
+ {
+ unsigned ret = 0;
+ for (size_t i=0; i<scene->size(); i++) {
+ Ty* mesh = at(i);
+ if (mesh == nullptr) continue;
+ ret = max(ret,mesh->numTimeSteps);
+ }
+ return ret;
+ }
+
+ private:
+ Scene* scene;
+ bool all;
+ };
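+ /* usage sketch: Scene::Iterator<TriangleMesh,false> it(scene) walks only the
+    enabled, non-motion-blurred triangle meshes; at(i) returns nullptr for any
+    slot that is empty, disabled, of another type, or of the wrong blur kind */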
+
+ class Iterator2
+ {
+ public:
+ Iterator2 () {}
+
+ Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur)
+ : scene(scene), typemask(typemask), mblur(mblur) {}
+
+ __forceinline Geometry* at(const size_t i)
+ {
+ Geometry* geom = scene->geometries[i].ptr;
+ if (geom == nullptr) return nullptr;
+ if (!geom->isEnabled()) return nullptr;
+ if (!(geom->getTypeMask() & typemask)) return nullptr;
+ if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+ return geom;
+ }
+
+ __forceinline Geometry* operator[] (const size_t i) {
+ return at(i);
+ }
+
+ __forceinline size_t size() const {
+ return scene->size();
+ }
+
+ private:
+ Scene* scene;
+ Geometry::GTypeMask typemask;
+ bool mblur;
+ };
+
+ public:
+
+ /*! Scene construction */
+ Scene (Device* device);
+
+ /*! Scene destruction */
+ ~Scene () noexcept;
+
+ private:
+ /*! class is non-copyable */
+ Scene (const Scene& other) DELETED; // do not implement
+ Scene& operator= (const Scene& other) DELETED; // do not implement
+
+ public:
+ void createTriangleAccel();
+ void createTriangleMBAccel();
+ void createQuadAccel();
+ void createQuadMBAccel();
+ void createHairAccel();
+ void createHairMBAccel();
+ void createSubdivAccel();
+ void createSubdivMBAccel();
+ void createUserGeometryAccel();
+ void createUserGeometryMBAccel();
+ void createInstanceAccel();
+ void createInstanceMBAccel();
+ void createInstanceExpensiveAccel();
+ void createInstanceExpensiveMBAccel();
+ void createGridAccel();
+ void createGridMBAccel();
+
+ /*! prints statistics about the scene */
+ void printStatistics();
+
+ /*! clears the scene */
+ void clear();
+
+ /*! detaches some geometry */
+ void detachGeometry(size_t geomID);
+
+ void setBuildQuality(RTCBuildQuality quality_flags);
+ RTCBuildQuality getBuildQuality() const;
+
+ void setSceneFlags(RTCSceneFlags scene_flags);
+ RTCSceneFlags getSceneFlags() const;
+
+ void commit (bool join);
+ void commit_task ();
+ void build () {}
+
+ void updateInterface();
+
+ /* return number of geometries */
+ __forceinline size_t size() const { return geometries.size(); }
+
+ /* bind geometry to the scene */
+ unsigned int bind (unsigned geomID, Ref<Geometry> geometry);
+
+ /* determines if scene is modified */
+ __forceinline bool isModified() const { return modified; }
+
+ /* sets modified flag */
+ __forceinline void setModified(bool f = true) {
+ modified = f;
+ }
+
+ __forceinline bool isGeometryModified(size_t geomID)
+ {
+ Ref<Geometry>& g = geometries[geomID];
+ if (!g) return false;
+ return g->getModCounter() > geometryModCounters_[geomID];
+ }
+
+ protected:
+
+ __forceinline void checkIfModifiedAndSet ()
+ {
+ if (isModified ()) return;
+
+ auto geometryIsModified = [this](size_t geomID)->bool {
+ return isGeometryModified(geomID);
+ };
+
+ if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) {
+ setModified ();
+ }
+ }
+
+ public:
+
+ /* get mesh by ID */
+ __forceinline Geometry* get(size_t i) { assert(i < geometries.size()); return geometries[i].ptr; }
+ __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; }
+
+ template<typename Mesh>
+ __forceinline Mesh* get(size_t i) {
+ assert(i < geometries.size());
+ assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+ return (Mesh*)geometries[i].ptr;
+ }
+ template<typename Mesh>
+ __forceinline const Mesh* get(size_t i) const {
+ assert(i < geometries.size());
+ assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+ return (Mesh*)geometries[i].ptr;
+ }
+
+ template<typename Mesh>
+ __forceinline Mesh* getSafe(size_t i) {
+ assert(i < geometries.size());
+ if (geometries[i] == null) return nullptr;
+ if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr;
+ else return (Mesh*) geometries[i].ptr;
+ }
+
+ __forceinline Ref<Geometry> get_locked(size_t i) {
+ Lock<SpinLock> lock(geometriesMutex);
+ assert(i < geometries.size());
+ return geometries[i];
+ }
+
+ /* flag decoding */
+ __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); }
+ __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; }
+ __forceinline bool isRobustAccel() const { return scene_flags & RTC_SCENE_FLAG_ROBUST; }
+ __forceinline bool isStaticAccel() const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); }
+ __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; }
+
+ __forceinline bool hasContextFilterFunction() const {
+ return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION;
+ }
+
+ __forceinline bool hasGeometryFilterFunction() {
+ return world.numFilterFunctions != 0;
+ }
+
+ __forceinline bool hasFilterFunction() {
+ return hasContextFilterFunction() || hasGeometryFilterFunction();
+ }
+
+ /* test if the scene has already been built */
+ __forceinline bool isBuild() const { return is_build; }
+
+ public:
+ IDPool<unsigned,0xFFFFFFFE> id_pool;
+ vector<Ref<Geometry>> geometries; //!< list of all user geometries
+ vector<unsigned int> geometryModCounters_;
+ vector<float*> vertices;
+
+ public:
+ Device* device;
+
+ /* these are to detect if we need to recreate the acceleration structures */
+ bool flags_modified;
+ unsigned int enabled_geometry_types;
+
+ RTCSceneFlags scene_flags;
+ RTCBuildQuality quality_flags;
+ MutexSys buildMutex;
+ SpinLock geometriesMutex;
+ bool is_build;
+ private:
+ bool modified; //!< true if scene got modified
+
+ public:
+
+ /*! global lock step task scheduler */
+#if defined(TASKING_INTERNAL)
+ MutexSys schedulerMutex;
+ Ref<TaskScheduler> scheduler;
+#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+ tbb::isolated_task_group group;
+#elif defined(TASKING_TBB)
+ tbb::task_group group;
+#elif defined(TASKING_PPL)
+ concurrency::task_group group;
+#endif
+
+ public:
+ struct BuildProgressMonitorInterface : public BuildProgressMonitor {
+ BuildProgressMonitorInterface(Scene* scene)
+ : scene(scene) {}
+ void operator() (size_t dn) const { scene->progressMonitor(double(dn)); }
+ private:
+ Scene* scene;
+ };
+ BuildProgressMonitorInterface progressInterface;
+ RTCProgressMonitorFunction progress_monitor_function;
+ void* progress_monitor_ptr;
+ std::atomic<size_t> progress_monitor_counter;
+ void progressMonitor(double nprims);
+ void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr);
+
+ private:
+ GeometryCounts world; //!< counts for geometry
+
+ public:
+
+ __forceinline size_t numPrimitives() const {
+ return world.size();
+ }
+
+ __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const
+ {
+ size_t count = 0;
+
+ if (mask & Geometry::MTY_TRIANGLE_MESH)
+ count += mblur ? world.numMBTriangles : world.numTriangles;
+
+ if (mask & Geometry::MTY_QUAD_MESH)
+ count += mblur ? world.numMBQuads : world.numQuads;
+
+ if (mask & Geometry::MTY_CURVE2)
+ count += mblur ? world.numMBLineSegments : world.numLineSegments;
+
+ if (mask & Geometry::MTY_CURVE4)
+ count += mblur ? world.numMBBezierCurves : world.numBezierCurves;
+
+ if (mask & Geometry::MTY_POINTS)
+ count += mblur ? world.numMBPoints : world.numPoints;
+
+ if (mask & Geometry::MTY_SUBDIV_MESH)
+ count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches;
+
+ if (mask & Geometry::MTY_USER_GEOMETRY)
+ count += mblur ? world.numMBUserGeometries : world.numUserGeometries;
+
+ if (mask & Geometry::MTY_INSTANCE_CHEAP)
+ count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap;
+
+ if (mask & Geometry::MTY_INSTANCE_EXPENSIVE)
+ count += mblur ? world.numMBInstancesExpensive : world.numInstancesExpensive;
+
+ if (mask & Geometry::MTY_GRID_MESH)
+ count += mblur ? world.numMBGrids : world.numGrids;
+
+ return count;
+ }
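+ /* the commit path in scene.cpp queries these cached counts per type, e.g.:
+      getNumPrimitives(TriangleMesh::geom_type, false)  // static triangles
+      getNumPrimitives(Geometry::MTY_CURVES, true)      // motion-blurred curves */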
+
+ template<typename Mesh, bool mblur>
+ __forceinline unsigned getNumTimeSteps()
+ {
+ if (!mblur)
+ return 1;
+
+ Scene::Iterator<Mesh,mblur> iter(this);
+ return iter.maxTimeStepsPerGeometry();
+ }
+
+ template<typename Mesh, bool mblur>
+ __forceinline unsigned int getMaxGeomID()
+ {
+ Scene::Iterator<Mesh,mblur> iter(this);
+ return iter.maxGeomID();
+ }
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
new file mode 100644
index 0000000000..2649ab0e3e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
@@ -0,0 +1,341 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+ /*! represents an array of bicubic Bezier curves */
+ struct CurveGeometry : public Geometry
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4;
+
+ public:
+
+ /*! bezier curve construction */
+ CurveGeometry (Device* device, Geometry::GType gtype);
+
+ public:
+ void setMask(unsigned mask);
+ void setNumTimeSteps (unsigned int numTimeSteps);
+ void setVertexAttributeCount (unsigned int N);
+ void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void commit();
+ bool verify();
+ void setTessellationRate(float N);
+ void setMaxRadiusScale(float s);
+ void addElementsToCount (GeometryCounts & counts) const;
+
+ public:
+
+ /*! returns the number of vertices */
+ __forceinline size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+ /*! returns the first vertex index of the i'th curve */
+ __forceinline const unsigned int& curve(size_t i) const {
+ return curves[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline Vec3ff vertex(size_t i) const {
+ return vertices0[i];
+ }
+
+ /*! returns i'th normal of the first time step */
+ __forceinline Vec3fa normal(size_t i) const {
+ return normals0[i];
+ }
+
+ /*! returns i'th tangent of the first time step */
+ __forceinline Vec3ff tangent(size_t i) const {
+ return tangents0[i];
+ }
+
+ /*! returns i'th normal derivative of the first time step */
+ __forceinline Vec3fa dnormal(size_t i) const {
+ return dnormals0[i];
+ }
+
+ /*! returns i'th radius of the first time step */
+ __forceinline float radius(size_t i) const {
+ return vertices0[i].w;
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+ return vertices[itime][i];
+ }
+
+ /*! returns i'th normal of itime'th timestep */
+ __forceinline Vec3fa normal(size_t i, size_t itime) const {
+ return normals[itime][i];
+ }
+
+ /*! returns i'th tangent of itime'th timestep */
+ __forceinline Vec3ff tangent(size_t i, size_t itime) const {
+ return tangents[itime][i];
+ }
+
+ /*! returns i'th normal derivative of itime'th timestep */
+ __forceinline Vec3fa dnormal(size_t i, size_t itime) const {
+ return dnormals[itime][i];
+ }
+
+ /*! returns i'th radius of itime'th timestep */
+ __forceinline float radius(size_t i, size_t itime) const {
+ return vertices[itime][i].w;
+ }
+
+ /*! gathers the curve starting with i'th vertex */
+ __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const
+ {
+ p0 = vertex(i+0);
+ p1 = vertex(i+1);
+ p2 = vertex(i+2);
+ p3 = vertex(i+3);
+ }
+
+ /*! gathers the curve starting with i'th vertex of itime'th timestep */
+ __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const
+ {
+ p0 = vertex(i+0,itime);
+ p1 = vertex(i+1,itime);
+ p2 = vertex(i+2,itime);
+ p3 = vertex(i+3,itime);
+ }
+
+ /*! gathers the curve starting with i'th vertex */
+ __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
+ {
+ p0 = vertex(i+0);
+ p1 = vertex(i+1);
+ p2 = vertex(i+2);
+ p3 = vertex(i+3);
+ n0 = normal(i+0);
+ n1 = normal(i+1);
+ n2 = normal(i+2);
+ n3 = normal(i+3);
+ }
+
+ /*! gathers the curve starting with i'th vertex of itime'th timestep */
+ __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const
+ {
+ p0 = vertex(i+0,itime);
+ p1 = vertex(i+1,itime);
+ p2 = vertex(i+2,itime);
+ p3 = vertex(i+3,itime);
+ n0 = normal(i+0,itime);
+ n1 = normal(i+1,itime);
+ n2 = normal(i+2,itime);
+ n3 = normal(i+3,itime);
+ }
+
+ /*! prefetches into L1 the curve vertices starting at the i'th vertex of the first time step */
+ __forceinline void prefetchL1_vertices(size_t i) const
+ {
+ prefetchL1(vertices0.getPtr(i)+0);
+ prefetchL1(vertices0.getPtr(i)+64);
+ }
+
+ /*! prefetches into L2 the curve vertices starting at the i'th vertex of the first time step */
+ __forceinline void prefetchL2_vertices(size_t i) const
+ {
+ prefetchL2(vertices0.getPtr(i)+0);
+ prefetchL2(vertices0.getPtr(i)+64);
+ }
+
+ /*! loads curve vertices for specified time */
+ __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const
+ {
+ float ftime;
+ const size_t itime = timeSegment(time, ftime);
+
+ const float t0 = 1.0f - ftime;
+ const float t1 = ftime;
+ Vec3ff a0,a1,a2,a3;
+ gather(a0,a1,a2,a3,i,itime);
+ Vec3ff b0,b1,b2,b3;
+ gather(b0,b1,b2,b3,i,itime+1);
+ p0 = madd(Vec3ff(t0),a0,t1*b0);
+ p1 = madd(Vec3ff(t0),a1,t1*b1);
+ p2 = madd(Vec3ff(t0),a2,t1*b2);
+ p3 = madd(Vec3ff(t0),a3,t1*b3);
+ }
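+
+ /* Note (editor's clarification): timeSegment() maps a global time t to a segment
+    index itime plus a local blend factor ftime in [0,1). Assuming a [0,1] time range
+    and 3 time steps (2 segments), time = 0.75 gives itime = 1 and ftime = 0.5, and
+    each control point is blended as p = (1-ftime)*a + ftime*b, which is exactly the
+    madd() expression above. */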
+
+ /*! loads curve vertices for specified time */
+ __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
+ {
+ float ftime;
+ const size_t itime = timeSegment(time, ftime);
+
+ const float t0 = 1.0f - ftime;
+ const float t1 = ftime;
+ Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3;
+ gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime);
+ Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3;
+ gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1);
+ p0 = madd(Vec3ff(t0),a0,t1*b0);
+ p1 = madd(Vec3ff(t0),a1,t1*b1);
+ p2 = madd(Vec3ff(t0),a2,t1*b2);
+ p3 = madd(Vec3ff(t0),a3,t1*b3);
+ n0 = madd(Vec3ff(t0),an0,t1*bn0);
+ n1 = madd(Vec3ff(t0),an1,t1*bn1);
+ n2 = madd(Vec3ff(t0),an2,t1*bn2);
+ n3 = madd(Vec3ff(t0),an3,t1*bn3);
+ }
+
+ template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+ __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+ {
+ Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3;
+ unsigned int vertexID = curve(primID);
+ gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime);
+ SourceCurve3ff ccurve(v0,v1,v2,v3);
+ SourceCurve3fa ncurve(n0,n1,n2,n3);
+ ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve);
+ return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+ }
+
+ template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+ __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+ {
+ float ftime;
+ const size_t itime = timeSegment(time, ftime);
+ const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0);
+ const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1);
+ return clerp(curve0,curve1,ftime);
+ }
+
+ /*! gathers the hermite curve starting with i'th vertex */
+ __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const
+ {
+ p0 = vertex (i+0);
+ p1 = vertex (i+1);
+ t0 = tangent(i+0);
+ t1 = tangent(i+1);
+ }
+
+ /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */
+ __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const
+ {
+ p0 = vertex (i+0,itime);
+ p1 = vertex (i+1,itime);
+ t0 = tangent(i+0,itime);
+ t1 = tangent(i+1,itime);
+ }
+
+ /*! loads curve vertices for specified time */
+ __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const
+ {
+ float ftime;
+ const size_t itime = timeSegment(time, ftime);
+ const float f0 = 1.0f - ftime, f1 = ftime;
+ Vec3ff ap0,at0,ap1,at1;
+ gather_hermite(ap0,at0,ap1,at1,i,itime);
+ Vec3ff bp0,bt0,bp1,bt1;
+ gather_hermite(bp0,bt0,bp1,bt1,i,itime+1);
+ p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+ t0 = madd(Vec3ff(f0),at0,f1*bt0);
+ p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+ t1 = madd(Vec3ff(f0),at1,f1*bt1);
+ }
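+
+ /* Background (editor's note): a cubic Hermite segment (p0,t0,p1,t1) is equivalent
+    to the Bezier control points b0 = p0, b1 = p0 + t0/3, b2 = p1 - t1/3, b3 = p1,
+    so Hermite input can reuse the same cubic evaluation machinery downstream. */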
+
+ /*! gathers the hermite curve starting with i'th vertex */
+ __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const
+ {
+ p0 = vertex (i+0);
+ p1 = vertex (i+1);
+ t0 = tangent(i+0);
+ t1 = tangent(i+1);
+ n0 = normal(i+0);
+ n1 = normal(i+1);
+ dn0 = dnormal(i+0);
+ dn1 = dnormal(i+1);
+ }
+
+ /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */
+ __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const
+ {
+ p0 = vertex (i+0,itime);
+ p1 = vertex (i+1,itime);
+ t0 = tangent(i+0,itime);
+ t1 = tangent(i+1,itime);
+ n0 = normal(i+0,itime);
+ n1 = normal(i+1,itime);
+ dn0 = dnormal(i+0,itime);
+ dn1 = dnormal(i+1,itime);
+ }
+
+ /*! loads curve vertices for specified time */
+ __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+ {
+ float ftime;
+ const size_t itime = timeSegment(time, ftime);
+ const float f0 = 1.0f - ftime, f1 = ftime;
+ Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1;
+ gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime);
+ Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1;
+ gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1);
+ p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+ t0 = madd(Vec3ff(f0),at0,f1*bt0);
+ n0 = madd(Vec3ff(f0),an0,f1*bn0);
+ dn0= madd(Vec3ff(f0),adn0,f1*bdn0);
+ p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+ t1 = madd(Vec3ff(f0),at1,f1*bt1);
+ n1 = madd(Vec3ff(f0),an1,f1*bn1);
+ dn1= madd(Vec3ff(f0),adn1,f1*bdn1);
+ }
+
+ template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+ __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+ {
+ Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1;
+ unsigned int vertexID = curve(primID);
+ gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime);
+
+ SourceCurve3ff ccurve(v0,t0,v1,t1);
+ SourceCurve3fa ncurve(n0,dn0,n1,dn1);
+ ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve);
+ return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+ }
+
+ template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+ __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+ {
+ float ftime;
+ const size_t itime = timeSegment(time, ftime);
+ const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0);
+ const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1);
+ return clerp(curve0,curve1,ftime);
+ }
+
+ private:
+ void resizeBuffers(unsigned int numSteps);
+
+ public:
+ BufferView<unsigned int> curves; //!< array of curve indices
+ BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer
+ BufferView<Vec3fa> normals0; //!< fast access to first normal buffer
+ BufferView<Vec3ff> tangents0; //!< fast access to first tangent buffer
+ BufferView<Vec3fa> dnormals0; //!< fast access to first normal derivative buffer
+ vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep
+ vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep
+ vector<BufferView<Vec3ff>> tangents; //!< tangent array for each timestep
+ vector<BufferView<Vec3fa>> dnormals; //!< normal derivative array for each timestep
+ BufferView<char> flags; //!< start, end flag per segment
+ vector<BufferView<char>> vertexAttribs; //!< user buffers
+ int tessellationRate; //!< tessellation rate for flat curves
+ float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii
+ };
+
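+ // Usage sketch (editor's illustration, not part of the original source): the buffers
+ // above are normally filled through the public API; 'device', 'verts' and 'indices'
+ // are assumptions of the example.
+ //
+ //   RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE);
+ //   rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT4,
+ //                              verts, 0, 4*sizeof(float), numVerts); // (x,y,z,radius)
+ //   rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT,
+ //                              indices, 0, sizeof(unsigned), numCurves);
+ //   rtcCommitGeometry(geom);
+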
+ DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
new file mode 100644
index 0000000000..c08658466a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+ /*! Grid Mesh */
+ struct GridMesh : public Geometry
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH;
+
+ /*! grid */
+ struct Grid
+ {
+ unsigned int startVtxID;
+ unsigned int lineVtxOffset;
+ unsigned short resX,resY;
+
+ /* border flags due to 3x3 vertex pattern */
+ __forceinline unsigned int get3x3FlagsX(const unsigned int x) const
+ {
+ return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0;
+ }
+
+ /* border flags due to 3x3 vertex pattern */
+ __forceinline unsigned int get3x3FlagsY(const unsigned int y) const
+ {
+ return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0;
+ }
+
+ /*! outputs grid structure */
+ __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) {
+ return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }";
+ }
+ };
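+
+ /* Layout note (editor's clarification): a Grid addresses a resX x resY block of
+    vertices in the shared vertex buffer; vertex (x,y) lives at index
+    startVtxID + x + y*lineVtxOffset. E.g. a 3x3 grid with startVtxID = 10 and
+    lineVtxOffset = 3 occupies indices 10..18. */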
+
+ public:
+
+ /*! grid mesh construction */
+ GridMesh (Device* device);
+
+ /* geometry interface */
+ public:
+ void setMask(unsigned mask);
+ void setNumTimeSteps (unsigned int numTimeSteps);
+ void setVertexAttributeCount (unsigned int N);
+ void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void commit();
+ bool verify();
+ void interpolate(const RTCInterpolateArguments* const args);
+ void addElementsToCount (GeometryCounts & counts) const;
+
+ __forceinline unsigned int getNumSubGrids(const size_t gridID)
+ {
+ const Grid &g = grid(gridID);
+ return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1));
+ }
+
+ /*! get fast access to first vertex buffer */
+ __forceinline float * getCompactVertexArray () const {
+ return (float*) vertices0.getPtr();
+ }
+
+ public:
+
+ /*! returns number of vertices */
+ __forceinline size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+ /*! returns i'th grid */
+ __forceinline const Grid& grid(size_t i) const {
+ return grids[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does an unaligned load
+ return vertices0[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline const char* vertexPtr(size_t i) const {
+ return vertices0.getPtr(i);
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+ return vertices[itime][i];
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+ return vertices[itime].getPtr(i);
+ }
+
+ /*! returns the vertex buffer index of grid vertex (x,y) */
+ __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const {
+ assert(x < (size_t)g.resX);
+ assert(y < (size_t)g.resY);
+ return g.startVtxID + x + y * g.lineVtxOffset;
+ }
+
+ /*! returns grid vertex (x,y) of the first time step */
+ __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const {
+ const size_t index = grid_vertex_index(g,x,y);
+ return vertex(index);
+ }
+
+ /*! returns grid vertex (x,y) of the itime'th time step */
+ __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const {
+ const size_t index = grid_vertex_index(g,x,y);
+ return vertex(index,itime);
+ }
+
+ /*! calculates the build bounds of the sub-grid starting at (sx,sy), if all of its vertices are valid in every time step */
+ __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
+ {
+ BBox3fa b(empty);
+ for (size_t t=0; t<numTimeSteps; t++)
+ {
+ for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+ for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+ {
+ const Vec3fa v = grid_vertex(g,x,y,t);
+ if (unlikely(!isvalid(v))) return false;
+ b.extend(v);
+ }
+ }
+
+ bbox = b;
+ return true;
+ }
+
+ /*! calculates the build bounds of the sub-grid starting at (sx,sy) at the itime'th time step, if it's valid */
+ __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const
+ {
+ assert(itime < numTimeSteps);
+ BBox3fa b0(empty);
+ for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+ for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+ {
+ const Vec3fa v = grid_vertex(g,x,y,itime);
+ if (unlikely(!isvalid(v))) return false;
+ b0.extend(v);
+ }
+
+ /* the builder uses the bounds at the start of the time segment */
+ bbox = b0;
+ return true;
+ }
+
+ __forceinline bool valid(size_t gridID, size_t itime=0) const {
+ return valid(gridID, make_range(itime, itime));
+ }
+
+ /*! checks if the gridID'th grid is valid over the specified time range */
+ __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const
+ {
+ if (unlikely(gridID >= grids.size())) return false;
+ const Grid &g = grid(gridID);
+ if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false;
+ if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false;
+
+ for (size_t y=0;y<g.resY;y++)
+ for (size_t x=0;x<g.resX;x++)
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+ if (!isvalid(grid_vertex(g,x,y,itime))) return false;
+ return true;
+ }
+
+
+ __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const
+ {
+ BBox3fa box(empty);
+ buildBounds(g,sx,sy,itime,box);
+ return box;
+ }
+
+ __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const {
+ BBox3fa bounds0, bounds1;
+ buildBounds(g,sx,sy,itime+0,bounds0);
+ buildBounds(g,sx,sy,itime+1,bounds1);
+ return LBBox3fa(bounds0,bounds1);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const {
+ return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments);
+ }
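+
+ /* Note (editor's understanding): this LBBox3fa constructor samples the bounds
+    lambda at the time steps overlapping dt and interpolates the endpoint bounds to
+    the clipped range, yielding conservative linear bounds over exactly
+    [dt.lower, dt.upper]. */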
+
+ public:
+ BufferView<Grid> grids; //!< array of grids
+ BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer
+ vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+ vector<RawBufferView> vertexAttribs; //!< vertex attributes
+ };
+
+ namespace isa
+ {
+ struct GridMeshISA : public GridMesh
+ {
+ GridMeshISA (Device* device)
+ : GridMesh(device) {}
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
new file mode 100644
index 0000000000..7ff82a4fb8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
@@ -0,0 +1,272 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "accel.h"
+
+namespace embree
+{
+ struct MotionDerivativeCoefficients;
+
+ /*! Instanced acceleration structure */
+ struct Instance : public Geometry
+ {
+ ALIGNED_STRUCT_(16);
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE;
+
+ public:
+ Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1);
+ ~Instance();
+
+ private:
+ Instance (const Instance& other) DELETED; // do not implement
+ Instance& operator= (const Instance& other) DELETED; // do not implement
+
+ private:
+ LBBox3fa nonlinearBounds(const BBox1f& time_range_in,
+ const BBox1f& geom_time_range,
+ float geom_time_segments) const;
+
+ BBox3fa boundSegment(size_t itime,
+ BBox3fa const& obbox0, BBox3fa const& obbox1,
+ BBox3fa const& bbox0, BBox3fa const& bbox1,
+ float t_min, float t_max) const;
+
+ /* calculates the (correct) interpolated bounds */
+ __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const
+ {
+ if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+ return xfmBounds(slerp(local2world[itime0], local2world[itime1], f),
+ lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+ return xfmBounds(lerp(local2world[itime0], local2world[itime1], f),
+ lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+ }
+
+ public:
+ virtual void setNumTimeSteps (unsigned int numTimeSteps) override;
+ virtual void setInstancedScene(const Ref<Scene>& scene) override;
+ virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override;
+ virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override;
+ virtual AffineSpace3fa getTransform(float time) override;
+ virtual void setMask (unsigned mask) override;
+ virtual void build() {}
+ virtual void addElementsToCount (GeometryCounts & counts) const override;
+ virtual void commit() override;
+
+ public:
+
+ /*! calculates the bounds of the instance */
+ __forceinline BBox3fa bounds(size_t i) const {
+ assert(i == 0);
+ if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+ return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds());
+ return xfmBounds(local2world[0],object->bounds.bounds());
+ }
+
+ /*! gets the bounds of the instanced scene */
+ __forceinline BBox3fa getObjectBounds(size_t itime) const {
+ return object->getBounds(timeStep(itime));
+ }
+
+ /*! calculates the bounds of the instance */
+ __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+ assert(i == 0);
+ if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+ return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime));
+ return xfmBounds(local2world[itime],getObjectBounds(itime));
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const {
+ assert(i == 0);
+ LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments);
+ return lbbox;
+ }
+
+ /*! calculates the build bounds of the i'th item, if it's valid */
+ __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+ {
+ assert(i==0);
+ const BBox3fa b = bounds(i);
+ if (bbox) *bbox = b;
+ return isvalid(b);
+ }
+
+ /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */
+ __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+ {
+ assert(i==0);
+ const LBBox3fa bounds = linearBounds(i,itime);
+ bbox = bounds.bounds ();
+ return isvalid(bounds);
+ }
+
+ /* gets version info of topology */
+ unsigned int getTopologyVersion() const {
+ return numPrimitives;
+ }
+
+ /* returns true if topology changed */
+ bool topologyChanged(unsigned int otherVersion) const {
+ return numPrimitives != otherVersion;
+ }
+
+ /*! check if the i'th primitive is valid between the specified time range */
+ __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+ {
+ assert(i == 0);
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+ if (!isvalid(bounds(i,itime))) return false;
+
+ return true;
+ }
+
+ __forceinline AffineSpace3fa getLocal2World() const
+ {
+ if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+ return quaternionDecompositionToAffineSpace(local2world[0]);
+ return local2world[0];
+ }
+
+ __forceinline AffineSpace3fa getLocal2World(float t) const
+ {
+ float ftime; const unsigned int itime = timeSegment(t, ftime);
+ if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+ return slerp(local2world[itime+0],local2world[itime+1],ftime);
+ return lerp(local2world[itime+0],local2world[itime+1],ftime);
+ }
+
+ __forceinline AffineSpace3fa getWorld2Local() const {
+ return world2local0;
+ }
+
+ __forceinline AffineSpace3fa getWorld2Local(float t) const {
+ return rcp(getLocal2World(t));
+ }
+
+ template<int K>
+ __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const
+ {
+ if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+ return getWorld2LocalSlerp(valid, t);
+ return getWorld2LocalLerp(valid, t);
+ }
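+
+ /* Usage sketch (editor's illustration): instance traversal transforms a ray into
+    object space with these matrices; the names below are assumptions of the example.
+
+      const AffineSpace3fa w2l = instance->getWorld2Local(ray.time());
+      const Vec3fa org_local = xfmPoint (w2l, ray.org);
+      const Vec3fa dir_local = xfmVector(w2l, ray.dir);
+ */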
+
+ private:
+
+ template<int K>
+ __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const
+ {
+ vfloat<K> ftime;
+ const vint<K> itime_k = timeSegment(t, ftime);
+ assert(any(valid));
+ const size_t index = bsf(movemask(valid));
+ const int itime = itime_k[index];
+ if (likely(all(valid, itime_k == vint<K>(itime)))) {
+ return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]),
+ AffineSpace3vff<K>(local2world[itime+1]),
+ ftime));
+ }
+ else {
+ AffineSpace3vff<K> space0,space1;
+ vbool<K> valid1 = valid;
+ while (any(valid1)) {
+ vbool<K> valid2;
+ const int itime = next_unique(valid1, itime_k, valid2);
+ space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0);
+ space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1);
+ }
+ return rcp(slerp(space0, space1, ftime));
+ }
+ }
+
+ template<int K>
+ __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const
+ {
+ vfloat<K> ftime;
+ const vint<K> itime_k = timeSegment(t, ftime);
+ assert(any(valid));
+ const size_t index = bsf(movemask(valid));
+ const int itime = itime_k[index];
+ if (likely(all(valid, itime_k == vint<K>(itime)))) {
+ return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]),
+ AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]),
+ ftime));
+ } else {
+ AffineSpace3vf<K> space0,space1;
+ vbool<K> valid1 = valid;
+ while (any(valid1)) {
+ vbool<K> valid2;
+ const int itime = next_unique(valid1, itime_k, valid2);
+ space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0);
+ space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1);
+ }
+ return rcp(lerp(space0, space1, ftime));
+ }
+ }
+
+ public:
+ Accel* object; //!< pointer to instanced acceleration structure
+ AffineSpace3ff* local2world; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition)
+ AffineSpace3fa world2local0; //!< transformation from world space to local space for timestep 0
+ };
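+
+ // Usage sketch (editor's illustration): the quaternion path (GTY_SUBTYPE_INSTANCE_QUATERNION)
+ // is enabled through the public API; 'hgeom' and the quaternion values are assumptions.
+ //
+ //   RTCQuaternionDecomposition qd;
+ //   rtcInitQuaternionDecomposition(&qd);
+ //   rtcQuaternionDecompositionSetQuaternion(&qd, qw, qx, qy, qz);
+ //   rtcSetGeometryTransformQuaternion(hgeom, /*timeStep*/ 0, &qd);
+ //
+ // after which motion blur uses slerp() between time steps instead of per-component lerp().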
+
+ namespace isa
+ {
+ struct InstanceISA : public Instance
+ {
+ InstanceISA (Device* device)
+ : Instance(device) {}
+
+ PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ assert(r.begin() == 0);
+ assert(r.end() == 1);
+
+ PrimInfo pinfo(empty);
+ BBox3fa b = empty;
+ if (!buildBounds(0,&b)) return pinfo;
+ // const BBox3fa b = bounds(0);
+ // if (!isvalid(b)) return pinfo;
+
+ const PrimRef prim(b,geomID,unsigned(0));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ assert(r.begin() == 0);
+ assert(r.end() == 1);
+
+ PrimInfo pinfo(empty);
+ BBox3fa b = empty;
+ if (!buildBounds(0,&b)) return pinfo;
+ // if (!valid(0,range<size_t>(itime))) return pinfo;
+ // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0));
+ const PrimRef prim(b,geomID,unsigned(0));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ assert(r.begin() == 0);
+ assert(r.end() == 1);
+
+ PrimInfoMB pinfo(empty);
+ if (!valid(0, timeSegmentRange(t0t1))) return pinfo;
+ const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0));
+ pinfo.add_primref(prim);
+ prims[k++] = prim;
+ return pinfo;
+ }
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
new file mode 100644
index 0000000000..c0f9ee8f77
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
@@ -0,0 +1,307 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+ /*! represents an array of line segments */
+ struct LineSegments : public Geometry
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2;
+
+ public:
+
+ /*! line segments construction */
+ LineSegments (Device* device, Geometry::GType gtype);
+
+ public:
+ void setMask (unsigned mask);
+ void setNumTimeSteps (unsigned int numTimeSteps);
+ void setVertexAttributeCount (unsigned int N);
+ void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void commit();
+ bool verify ();
+ void interpolate(const RTCInterpolateArguments* const args);
+ void setTessellationRate(float N);
+ void setMaxRadiusScale(float s);
+ void addElementsToCount (GeometryCounts & counts) const;
+
+ public:
+
+ /*! returns the number of vertices */
+ __forceinline size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+ /*! returns the first vertex index of the i'th segment */
+ __forceinline const unsigned int& segment(size_t i) const {
+ return segments[i];
+ }
+
+ /*! returns true if a segment exists to the left of the i'th segment */
+ __forceinline bool segmentLeftExists(size_t i) const {
+ assert (flags);
+ return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0;
+ }
+
+ /*! returns true if a segment exists to the right of the i'th segment */
+ __forceinline bool segmentRightExists(size_t i) const {
+ assert (flags);
+ return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0;
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline Vec3ff vertex(size_t i) const {
+ return vertices0[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline const char* vertexPtr(size_t i) const {
+ return vertices0.getPtr(i);
+ }
+
+ /*! returns i'th normal of the first time step */
+ __forceinline Vec3fa normal(size_t i) const {
+ return normals0[i];
+ }
+
+ /*! returns i'th radius of the first time step */
+ __forceinline float radius(size_t i) const {
+ return vertices0[i].w;
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+ return vertices[itime][i];
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+ return vertices[itime].getPtr(i);
+ }
+
+ /*! returns i'th normal of itime'th timestep */
+ __forceinline Vec3fa normal(size_t i, size_t itime) const {
+ return normals[itime][i];
+ }
+
+ /*! returns i'th radius of itime'th timestep */
+ __forceinline float radius(size_t i, size_t itime) const {
+ return vertices[itime][i].w;
+ }
+
+ /*! calculates the bounding box of a line segment with the given endpoints */
+ __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const
+ {
+ const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1));
+ return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w)));
+ }
+
+ /*! calculates bounding box of i'th line segment */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ const unsigned int index = segment(i);
+ const Vec3ff v0 = vertex(index+0);
+ const Vec3ff v1 = vertex(index+1);
+ return bounds(v0,v1);
+ }
+
+ /*! calculates bounding box of i'th line segment for the itime'th time step */
+ __forceinline BBox3fa bounds(size_t i, size_t itime) const
+ {
+ const unsigned int index = segment(i);
+ const Vec3ff v0 = vertex(index+0,itime);
+ const Vec3ff v1 = vertex(index+1,itime);
+ return bounds(v0,v1);
+ }
+
+ /*! calculates the bounding box of the i'th line segment in the given space */
+ __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+ {
+ const unsigned int index = segment(i);
+ const Vec3ff v0 = vertex(index+0);
+ const Vec3ff v1 = vertex(index+1);
+ const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w);
+ const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w);
+ return bounds(w0,w1);
+ }
+
+ /*! calculates the bounding box of the i'th line segment in the given space for the itime'th time step */
+ __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+ {
+ const unsigned int index = segment(i);
+ const Vec3ff v0 = vertex(index+0,itime);
+ const Vec3ff v1 = vertex(index+1,itime);
+ const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w);
+ const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w);
+ return bounds(w0,w1);
+ }
+
+ /*! check if the i'th primitive is valid at the itime'th timestep */
+ __forceinline bool valid(size_t i, size_t itime) const {
+ return valid(i, make_range(itime, itime));
+ }
+
+ /*! check if the i'th primitive is valid between the specified time range */
+ __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+ {
+ const unsigned int index = segment(i);
+ if (index+1 >= numVertices()) return false;
+
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+ {
+ const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false;
+ const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false;
+ if (min(v0.w,v1.w) < 0.0f) return false;
+ }
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th primitive at the itime'th time segment */
+ __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+ return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+ }
+
+ /*! calculates the build bounds of the i'th primitive, if it's valid */
+ __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+ {
+ if (!valid(i,0)) return false;
+ *bbox = bounds(i);
+ return true;
+ }
+
+ /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+ __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+ {
+ if (!valid(i,itime+0) || !valid(i,itime+1)) return false;
+ bbox = bounds(i,itime); // the builder uses the bounds at the start of the time segment
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+ {
+ if (!valid(i, timeSegmentRange(time_range))) return false;
+ bbox = linearBounds(i, time_range);
+ return true;
+ }
+
+ /*! get fast access to first vertex buffer */
+ __forceinline float * getCompactVertexArray () const {
+ return (float*) vertices0.getPtr();
+ }
+
+ public:
+ BufferView<unsigned int> segments; //!< array of line segment indices
+ BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer
+ BufferView<Vec3fa> normals0; //!< fast access to first normal buffer
+ BufferView<char> flags; //!< start, end flag per segment
+ vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep
+ vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep
+ vector<BufferView<char>> vertexAttribs; //!< user buffers
+ int tessellationRate; //!< tessellation rate for bezier curve
+ float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii
+ };
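+
+ // Note (editor's clarification): the flags buffer carries the RTC_CURVE_FLAG_NEIGHBOR_LEFT /
+ // RTC_CURVE_FLAG_NEIGHBOR_RIGHT bits tested by segmentLeftExists()/segmentRightExists()
+ // above; for cone-curve geometry these mark interior joints of a chain, so end caps are
+ // placed only at true chain endpoints.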
+
+ namespace isa
+ {
+ struct LineSegmentsISA : public LineSegments
+ {
+ LineSegmentsISA (Device* device, Geometry::GType gtype)
+ : LineSegments(device,gtype) {}
+
+ Vec3fa computeDirection(unsigned int primID) const
+ {
+ const unsigned vtxID = segment(primID);
+ const Vec3fa v0 = vertex(vtxID+0);
+ const Vec3fa v1 = vertex(vtxID+1);
+ return v1-v0;
+ }
+
+ Vec3fa computeDirection(unsigned int primID, size_t time) const
+ {
+ const unsigned vtxID = segment(primID);
+ const Vec3fa v0 = vertex(vtxID+0,time);
+ const Vec3fa v1 = vertex(vtxID+1,time);
+ return v1-v0;
+ }
+
+ PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,&bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,itime,bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfoMB pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!valid(j, timeSegmentRange(t0t1))) continue;
+ const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+ pinfo.add_primref(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ BBox3fa vbounds(size_t i) const {
+ return bounds(i);
+ }
+
+ BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const {
+ return bounds(space,i);
+ }
+
+ LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+ return linearBounds(primID,time_range);
+ }
+
+ LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+ return linearBounds(space,primID,time_range);
+ }
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h
new file mode 100644
index 0000000000..1d39ed07ba
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_points.h
@@ -0,0 +1,282 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "buffer.h"
+#include "default.h"
+#include "geometry.h"
+
+namespace embree
+{
+ /*! represents an array of points */
+ struct Points : public Geometry
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS;
+
+ public:
+ /*! points construction */
+ Points(Device* device, Geometry::GType gtype);
+
+ public:
+ void setMask(unsigned mask);
+ void setNumTimeSteps(unsigned int numTimeSteps);
+ void setVertexAttributeCount(unsigned int N);
+ void setBuffer(RTCBufferType type,
+ unsigned int slot,
+ RTCFormat format,
+ const Ref<Buffer>& buffer,
+ size_t offset,
+ size_t stride,
+ unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void commit();
+ bool verify();
+ void setMaxRadiusScale(float s);
+ void addElementsToCount (GeometryCounts & counts) const;
+
+ public:
+ /*! returns the number of vertices */
+ __forceinline size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline Vec3ff vertex(size_t i) const {
+ return vertices0[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline const char* vertexPtr(size_t i) const {
+ return vertices0.getPtr(i);
+ }
+
+ /*! returns i'th normal of the first time step */
+ __forceinline Vec3fa normal(size_t i) const {
+ return normals0[i];
+ }
+
+ /*! returns i'th radius of the first time step */
+ __forceinline float radius(size_t i) const {
+ return vertices0[i].w;
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+ return vertices[itime][i];
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+ return vertices[itime].getPtr(i);
+ }
+
+ /*! returns i'th normal of itime'th timestep */
+ __forceinline Vec3fa normal(size_t i, size_t itime) const {
+ return normals[itime][i];
+ }
+
+ /*! returns i'th radius of itime'th timestep */
+ __forceinline float radius(size_t i, size_t itime) const {
+ return vertices[itime][i].w;
+ }
+
+ /*! calculates the bounding box of a point with the given center and radius */
+ __forceinline BBox3fa bounds(const Vec3ff& v0) const {
+ return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w));
+ }
+
+ /*! calculates the bounding box of the i'th point */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ const Vec3ff v0 = vertex(i);
+ return bounds(v0);
+ }
+
+ /*! calculates the bounding box of the i'th point for the itime'th time step */
+ __forceinline BBox3fa bounds(size_t i, size_t itime) const
+ {
+ const Vec3ff v0 = vertex(i, itime);
+ return bounds(v0);
+ }
+
+ /*! calculates the bounding box of the i'th point in the given space */
+ __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+ {
+ const Vec3ff v0 = vertex(i);
+ const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+ return bounds(w0);
+ }
+
+ /*! calculates the bounding box of the i'th point in the given space for the itime'th time step */
+ __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+ {
+ const Vec3ff v0 = vertex(i, itime);
+ const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+ return bounds(w0);
+ }
+
+ /*! check if the i'th primitive is valid at the itime'th timestep */
+ __forceinline bool valid(size_t i, size_t itime) const {
+ return valid(i, make_range(itime, itime));
+ }
+
+ /*! check if the i'th primitive is valid between the specified time range */
+ __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+ {
+ const unsigned int index = (unsigned int)i;
+ if (index >= numVertices())
+ return false;
+
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) {
+ const Vec3ff v0 = vertex(index + 0, itime);
+ if (unlikely(!isvalid4(v0)))
+ return false;
+ if (v0.w < 0.0f)
+ return false;
+ }
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th primitive at the itime'th time segment */
+ __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+ return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1));
+ }
+
+ /*! calculates the build bounds of the i'th primitive, if it's valid */
+ __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+ {
+ if (!valid(i, 0))
+ return false;
+ *bbox = bounds(i);
+ return true;
+ }
+
+ /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+ __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+ {
+ if (!valid(i, itime + 0) || !valid(i, itime + 1))
+ return false;
+ bbox = bounds(i, itime); // the builder uses the bounds at the start of the time segment
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+ {
+ if (!valid(i, timeSegmentRange(time_range))) return false;
+ bbox = linearBounds(i, time_range);
+ return true;
+ }
+
+ /*! get fast access to first vertex buffer */
+ __forceinline float * getCompactVertexArray () const {
+ return (float*) vertices0.getPtr();
+ }
+
+ public:
+ BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer
+ BufferView<Vec3fa> normals0; //!< fast access to first normal buffer
+ vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep
+ vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep
+ vector<BufferView<char>> vertexAttribs; //!< user buffers
+ float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii
+ };
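+
+ // Note (editor's clarification): Points backs the RTC_GEOMETRY_TYPE_SPHERE_POINT,
+ // RTC_GEOMETRY_TYPE_DISC_POINT and RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT geometry
+ // types; vertices are (x,y,z,radius) in RTC_FORMAT_FLOAT4, and the oriented-disc
+ // variant additionally requires an RTC_BUFFER_TYPE_NORMAL buffer (normals0 above).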
+
+ namespace isa
+ {
+ struct PointsISA : public Points
+ {
+ PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {}
+
+ Vec3fa computeDirection(unsigned int primID) const
+ {
+ return Vec3fa(1, 0, 0);
+ }
+
+ Vec3fa computeDirection(unsigned int primID, size_t time) const
+ {
+ return Vec3fa(1, 0, 0);
+ }
+
+ PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j = r.begin(); j < r.end(); j++) {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j, &bounds))
+ continue;
+ const PrimRef prim(bounds, geomID, unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j = r.begin(); j < r.end(); j++) {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j, itime, bounds))
+ continue;
+ const PrimRef prim(bounds, geomID, unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims,
+ const BBox1f& t0t1,
+ const range<size_t>& r,
+ size_t k,
+ unsigned int geomID) const
+ {
+ PrimInfoMB pinfo(empty);
+ for (size_t j = r.begin(); j < r.end(); j++) {
+ if (!valid(j, timeSegmentRange(t0t1)))
+ continue;
+ const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j));
+ pinfo.add_primref(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ BBox3fa vbounds(size_t i) const
+ {
+ return bounds(i);
+ }
+
+ BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const
+ {
+ return bounds(space, i);
+ }
+
+ LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const
+ {
+ return linearBounds(primID, time_range);
+ }
+
+ LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const
+ {
+ return linearBounds(space, primID, time_range);
+ }
+ };
+ } // namespace isa
+
+ DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType);
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
new file mode 100644
index 0000000000..d5bb054b14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+ /*! Quad Mesh */
+ struct QuadMesh : public Geometry
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH;
+
+ /*! quad indices */
+ struct Quad
+ {
+ uint32_t v[4];
+
+ /*! outputs quad indices */
+ __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) {
+ return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }";
+ }
+ };
+
+ public:
+
+ /*! quad mesh construction */
+ QuadMesh (Device* device);
+
+ /* geometry interface */
+ public:
+ void setMask(unsigned mask);
+ void setNumTimeSteps (unsigned int numTimeSteps);
+ void setVertexAttributeCount (unsigned int N);
+ void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void commit();
+ bool verify();
+ void interpolate(const RTCInterpolateArguments* const args);
+ void addElementsToCount (GeometryCounts & counts) const;
+
+ public:
+
+ /*! returns number of vertices */
+ __forceinline size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+ /*! returns i'th quad */
+ __forceinline const Quad& quad(size_t i) const {
+ return quads[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline const Vec3fa vertex(size_t i) const {
+ return vertices0[i];
+ }
+
+ /*! returns pointer to the i'th vertex of the first time step */
+ __forceinline const char* vertexPtr(size_t i) const {
+ return vertices0.getPtr(i);
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+ return vertices[itime][i];
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+ return vertices[itime].getPtr(i);
+ }
+
+ /*! calculates the bounds of the i'th quad */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ const Quad& q = quad(i);
+ const Vec3fa v0 = vertex(q.v[0]);
+ const Vec3fa v1 = vertex(q.v[1]);
+ const Vec3fa v2 = vertex(q.v[2]);
+ const Vec3fa v3 = vertex(q.v[3]);
+ return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+ }
+
+ /*! calculates the bounds of the i'th quad at the itime'th timestep */
+ __forceinline BBox3fa bounds(size_t i, size_t itime) const
+ {
+ const Quad& q = quad(i);
+ const Vec3fa v0 = vertex(q.v[0],itime);
+ const Vec3fa v1 = vertex(q.v[1],itime);
+ const Vec3fa v2 = vertex(q.v[2],itime);
+ const Vec3fa v3 = vertex(q.v[3],itime);
+ return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+ }
+
+ /*! check if the i'th primitive is valid at the itime'th timestep */
+ __forceinline bool valid(size_t i, size_t itime) const {
+ return valid(i, make_range(itime, itime));
+ }
+
+ /*! check if the i'th primitive is valid between the specified time range */
+ __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+ {
+ const Quad& q = quad(i);
+ if (unlikely(q.v[0] >= numVertices())) return false;
+ if (unlikely(q.v[1] >= numVertices())) return false;
+ if (unlikely(q.v[2] >= numVertices())) return false;
+ if (unlikely(q.v[3] >= numVertices())) return false;
+
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+ {
+ if (!isvalid(vertex(q.v[0],itime))) return false;
+ if (!isvalid(vertex(q.v[1],itime))) return false;
+ if (!isvalid(vertex(q.v[2],itime))) return false;
+ if (!isvalid(vertex(q.v[3],itime))) return false;
+ }
+
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th quad at the itime'th time segment */
+ __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+ return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+ }
+
+ /*! calculates the build bounds of the i'th primitive, if it's valid */
+ __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+ {
+ const Quad& q = quad(i);
+ if (q.v[0] >= numVertices()) return false;
+ if (q.v[1] >= numVertices()) return false;
+ if (q.v[2] >= numVertices()) return false;
+ if (q.v[3] >= numVertices()) return false;
+
+ for (unsigned int t=0; t<numTimeSteps; t++)
+ {
+ const Vec3fa v0 = vertex(q.v[0],t);
+ const Vec3fa v1 = vertex(q.v[1],t);
+ const Vec3fa v2 = vertex(q.v[2],t);
+ const Vec3fa v3 = vertex(q.v[3],t);
+
+ if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3)))
+ return false;
+ }
+
+ if (bbox)
+ *bbox = bounds(i);
+
+ return true;
+ }
+
+ /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+ __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+ {
+ const Quad& q = quad(i);
+ if (unlikely(q.v[0] >= numVertices())) return false;
+ if (unlikely(q.v[1] >= numVertices())) return false;
+ if (unlikely(q.v[2] >= numVertices())) return false;
+ if (unlikely(q.v[3] >= numVertices())) return false;
+
+ assert(itime+1 < numTimeSteps);
+ const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+ const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+ const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+ const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false;
+ const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+ const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+ const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+ const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false;
+
+ /* the builder uses the bounds at the start of the time segment */
+ bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3));
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const
+ {
+ if (!valid(i, timeSegmentRange(dt))) return false;
+ bbox = linearBounds(i, dt);
+ return true;
+ }
+
+ /*! get fast access to first vertex buffer */
+ __forceinline float * getCompactVertexArray () const {
+ return (float*) vertices0.getPtr();
+ }
+
+ /* gets version info of topology */
+ unsigned int getTopologyVersion() const {
+ return quads.modCounter;
+ }
+
+ /* returns true if topology changed */
+ bool topologyChanged(unsigned int otherVersion) const {
+ return quads.isModified(otherVersion); // || numPrimitivesChanged;
+ }
+
+ /* returns the projected area of the i'th quad */
+ __forceinline float projectedPrimitiveArea(const size_t i) const {
+ const Quad& q = quad(i);
+ const Vec3fa v0 = vertex(q.v[0]);
+ const Vec3fa v1 = vertex(q.v[1]);
+ const Vec3fa v2 = vertex(q.v[2]);
+ const Vec3fa v3 = vertex(q.v[3]);
+ return areaProjectedTriangle(v0,v1,v3) +
+ areaProjectedTriangle(v1,v2,v3);
+ }
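+
+ /* Note (editor's clarification): the quad (v0,v1,v2,v3) is split along the v1-v3
+    diagonal into the triangles (v0,v1,v3) and (v1,v2,v3), and the projected area is
+    the sum of the two projected triangle areas. */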
+
+ public:
+ BufferView<Quad> quads; //!< array of quads
+ BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer
+ vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+ vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers
+ };
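+
+ // Usage sketch (editor's illustration, not part of the original source): quads are
+ // fed with an RTC_FORMAT_UINT4 index buffer; 'device', 'verts' and 'quadIndices'
+ // are assumptions of the example.
+ //
+ //   RTCGeometry geom = rtcNewGeometry(device, RTC_GEOMETRY_TYPE_QUAD);
+ //   rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3,
+ //                              verts, 0, 16 /* padded stride */, numVerts);
+ //   rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT4,
+ //                              quadIndices, 0, 4*sizeof(unsigned), numQuads);
+ //   rtcCommitGeometry(geom);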
+
+ namespace isa
+ {
+ struct QuadMeshISA : public QuadMesh
+ {
+ QuadMeshISA (Device* device)
+ : QuadMesh(device) {}
+
+ PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,&bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,itime,bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfoMB pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!valid(j, timeSegmentRange(t0t1))) continue;
+ const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+ pinfo.add_primref(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
new file mode 100644
index 0000000000..d0246009db
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+#include "../subdiv/half_edge.h"
+#include "../subdiv/tessellation_cache.h"
+#include "../subdiv/catmullclark_coefficients.h"
+#include "../subdiv/patch.h"
+#include "../../common/algorithms/parallel_map.h"
+#include "../../common/algorithms/parallel_set.h"
+
+namespace embree
+{
+ class SubdivMesh : public Geometry
+ {
+ ALIGNED_CLASS_(16);
+ public:
+
+ typedef HalfEdge::Edge Edge;
+
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH;
+
+ /*! structure used to sort half edges using radix sort by their key */
+ struct KeyHalfEdge
+ {
+ KeyHalfEdge() {}
+
+ KeyHalfEdge (uint64_t key, HalfEdge* edge)
+ : key(key), edge(edge) {}
+
+ __forceinline operator uint64_t() const {
+ return key;
+ }
+
+ friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) {
+ return e0.key < e1.key;
+ }
+
+ public:
+ uint64_t key;
+ HalfEdge* edge;
+ };
+
+ public:
+
+ /*! subdiv mesh construction */
+ SubdivMesh(Device* device);
+
+ public:
+ void setMask (unsigned mask);
+ void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode);
+ void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID);
+ void setNumTimeSteps (unsigned int numTimeSteps);
+ void setVertexAttributeCount (unsigned int N);
+ void setTopologyCount (unsigned int N);
+ void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void setTessellationRate(float N);
+ bool verify();
+ void commit();
+ void addElementsToCount (GeometryCounts & counts) const;
+ void setDisplacementFunction (RTCDisplacementFunctionN func);
+ unsigned int getFirstHalfEdge(unsigned int faceID);
+ unsigned int getFace(unsigned int edgeID);
+ unsigned int getNextHalfEdge(unsigned int edgeID);
+ unsigned int getPreviousHalfEdge(unsigned int edgeID);
+ unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID);
+
+ public:
+
+ /*! return the number of faces */
+ size_t numFaces() const {
+ return faceVertices.size();
+ }
+
+ /*! return the number of edges */
+ size_t numEdges() const {
+ return topology[0].vertexIndices.size();
+ }
+
+ /*! return the number of vertices */
+ size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+ /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */
+ __forceinline BBox3fa bounds(size_t i, size_t j = 0) const {
+ return topology[0].getHalfEdge(i)->bounds(vertices[j]);
+ }
+
+ /*! check if the i'th primitive is valid */
+ __forceinline bool valid(size_t i) const {
+ return topology[0].valid(i) && !invalidFace(i);
+ }
+
+    /*! check if the i'th primitive is valid for the j'th time step */
+ __forceinline bool valid(size_t i, size_t j) const {
+ return topology[0].valid(i) && !invalidFace(i,j);
+ }
+
+ /*! prints some statistics */
+ void printStatistics();
+
+ /*! initializes the half edge data structure */
+ void initializeHalfEdgeStructures ();
+
+ public:
+
+ /*! returns the vertex buffer for some time step */
+ __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const {
+ return vertices[t];
+ }
+
+ /* returns tessellation level of edge */
+ __forceinline float getEdgeLevel(const size_t i) const
+ {
+ if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level?
+ else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level?
+ }
+
+ public:
+ RTCDisplacementFunctionN displFunc; //!< displacement function
+
+ /*! all buffers in this section are provided by the application */
+ public:
+
+ /*! the topology contains all data that may differ when
+ * interpolating different user data buffers */
+ struct Topology
+ {
+ public:
+
+ /*! Default topology construction */
+ Topology () : halfEdges(nullptr,0) {}
+
+ /*! Topology initialization */
+ Topology (SubdivMesh* mesh);
+
+ /*! make the class movable */
+ public:
+      Topology (Topology&& other) // FIXME: this is only required to work around compilation issues under Windows
+ : mesh(std::move(other.mesh)),
+ vertexIndices(std::move(other.vertexIndices)),
+ subdiv_mode(std::move(other.subdiv_mode)),
+ halfEdges(std::move(other.halfEdges)),
+ halfEdges0(std::move(other.halfEdges0)),
+ halfEdges1(std::move(other.halfEdges1)) {}
+
+      Topology& operator= (Topology&& other) // FIXME: this is only required to work around compilation issues under Windows
+ {
+ mesh = std::move(other.mesh);
+ vertexIndices = std::move(other.vertexIndices);
+ subdiv_mode = std::move(other.subdiv_mode);
+ halfEdges = std::move(other.halfEdges);
+ halfEdges0 = std::move(other.halfEdges0);
+ halfEdges1 = std::move(other.halfEdges1);
+ return *this;
+ }
+
+ public:
+ /*! check if the i'th primitive is valid in this topology */
+ __forceinline bool valid(size_t i) const
+ {
+ if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) {
+ if (getHalfEdge(i)->faceHasBorder()) return false;
+ }
+ return true;
+ }
+
+ /*! updates the interpolation mode for the topology */
+ void setSubdivisionMode (RTCSubdivisionMode mode);
+
+ /*! marks all buffers as modified */
+ void update ();
+
+ /*! verifies index array */
+ bool verify (size_t numVertices);
+
+ /*! initializes the half edge data structure */
+ void initializeHalfEdgeStructures ();
+
+ private:
+
+ /*! recalculates the half edges */
+ void calculateHalfEdges();
+
+ /*! updates half edges when recalculation is not necessary */
+ void updateHalfEdges();
+
+ /*! user input data */
+ public:
+
+ SubdivMesh* mesh;
+
+ /*! indices of the vertices composing each face */
+ BufferView<unsigned int> vertexIndices;
+
+ /*! subdiv interpolation mode */
+ RTCSubdivisionMode subdiv_mode;
+
+ /*! generated data */
+ public:
+
+ /*! returns the start half edge for face f */
+ __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const {
+ return &halfEdges[mesh->faceStartEdge[f]];
+ }
+
+      /*! Half edge structure, generated by initializeHalfEdgeStructures */
+ mvector<HalfEdge> halfEdges;
+
+ /*! the following data is only required during construction of the
+ * half edge structure and can be cleared for static scenes */
+ private:
+
+ /*! two arrays used to sort the half edges */
+ std::vector<KeyHalfEdge> halfEdges0;
+ std::vector<KeyHalfEdge> halfEdges1;
+ };
+
+ /*! returns the start half edge for topology t and face f */
+ __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const {
+ return topology[t].getHalfEdge(f);
+ }
+
+ /*! buffer containing the number of vertices for each face */
+ BufferView<unsigned int> faceVertices;
+
+ /*! array of topologies */
+ vector<Topology> topology;
+
+ /*! vertex buffer (one buffer for each time step) */
+ vector<BufferView<Vec3fa>> vertices;
+
+ /*! user data buffers */
+ vector<RawBufferView> vertexAttribs;
+
+ /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */
+ BufferView<Edge> edge_creases;
+
+ /*! edge crease weights for each edge of the edge_creases buffer */
+ BufferView<float> edge_crease_weights;
+
+ /*! vertex crease buffer containing all vertices that carry vertex crease weights */
+ BufferView<unsigned int> vertex_creases;
+
+ /*! vertex crease weights for each vertex of the vertex_creases buffer */
+ BufferView<float> vertex_crease_weights;
+
+ /*! subdivision level for each half edge of the vertexIndices buffer */
+ BufferView<float> levels;
+ float tessellationRate; // constant rate that is used when levels is not set
+
+ /*! buffer that marks specific faces as holes */
+ BufferView<unsigned> holes;
+
+    /*! all data in this section is generated by the initializeHalfEdgeStructures function */
+ private:
+
+ /*! number of half edges used by faces */
+ size_t numHalfEdges;
+
+ /*! fast lookup table to find the first half edge for some face */
+ mvector<uint32_t> faceStartEdge;
+
+ /*! fast lookup table to find the face for some half edge */
+ mvector<uint32_t> halfEdgeFace;
+
+ /*! set with all holes */
+ parallel_set<uint32_t> holeSet;
+
+ /*! fast lookup table to detect invalid faces */
+ mvector<int8_t> invalid_face;
+
+ /*! test if face i is invalid in timestep j */
+ __forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; }
+ __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
+
+ /*! interpolation cache */
+ public:
+ static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; }
+ static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; }
+ static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) {
+ const size_t slots = numInterpolationSlots4(stride);
+ assert(slot < slots);
+ return slots*prim+slot;
+ }
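+    /* worked example (illustrative): for a vertex attribute with a 20 byte
+     * stride, numInterpolationSlots4 yields (20+15)/16 = 2 cache slots per
+     * primitive, and interpolationSlot(prim,slot,20) = 2*prim+slot */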
+ std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags;
+ std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags;
+ std::vector<Patch3fa::Ref> patch_eval_trees;
+
+ /*! the following data is only required during construction of the
+ * half edge structure and can be cleared for static scenes */
+ private:
+
+ /*! map with all vertex creases */
+ parallel_map<uint32_t,float> vertexCreaseMap;
+
+ /*! map with all edge creases */
+ parallel_map<uint64_t,float> edgeCreaseMap;
+
+ protected:
+
+ /*! counts number of geometry commits */
+ size_t commitCounter;
+ };
+
+ namespace isa
+ {
+ struct SubdivMeshISA : public SubdivMesh
+ {
+ SubdivMeshISA (Device* device)
+ : SubdivMesh(device) {}
+
+ void interpolate(const RTCInterpolateArguments* const args);
+ void interpolateN(const RTCInterpolateNArguments* const args);
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
new file mode 100644
index 0000000000..d1c2750f14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
@@ -0,0 +1,243 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene_triangle_mesh.h"
+#include "scene.h"
+
+namespace embree
+{
+#if defined(EMBREE_LOWEST_ISA)
+
+ TriangleMesh::TriangleMesh (Device* device)
+ : Geometry(device,GTY_TRIANGLE_MESH,0,1)
+ {
+ vertices.resize(numTimeSteps);
+ }
+
+ void TriangleMesh::setMask (unsigned mask)
+ {
+ this->mask = mask;
+ Geometry::update();
+ }
+
+ void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps)
+ {
+ vertices.resize(numTimeSteps);
+ Geometry::setNumTimeSteps(numTimeSteps);
+ }
+
+ void TriangleMesh::setVertexAttributeCount (unsigned int N)
+ {
+ vertexAttribs.resize(N);
+ Geometry::update();
+ }
+
+ void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num)
+ {
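+    /* for illustration: at the API level this path is typically reached via a
+     * call such as
+     *   rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0,
+     *     RTC_FORMAT_FLOAT3, ptr, 0, 3*sizeof(float), numVertices);
+     * where geom, ptr and numVertices are placeholders */
+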
+ /* verify that all accesses are 4 bytes aligned */
+ if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3))
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned");
+
+ if (type == RTC_BUFFER_TYPE_VERTEX)
+ {
+ if (format != RTC_FORMAT_FLOAT3)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format");
+
+ /* if buffer is larger than 16GB the premultiplied index optimization does not work */
+ if (stride*num > 16ll*1024ll*1024ll*1024ll)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large");
+
+ if (slot >= vertices.size())
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot");
+
+ vertices[slot].set(buffer, offset, stride, num, format);
+ vertices[slot].checkPadding16();
+ vertices0 = vertices[0];
+ }
+ else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+ {
+ if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format");
+
+ if (slot >= vertexAttribs.size())
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot");
+
+ vertexAttribs[slot].set(buffer, offset, stride, num, format);
+ vertexAttribs[slot].checkPadding16();
+ }
+ else if (type == RTC_BUFFER_TYPE_INDEX)
+ {
+ if (slot != 0)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ if (format != RTC_FORMAT_UINT3)
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format");
+
+ triangles.set(buffer, offset, stride, num, format);
+ setNumPrimitives(num);
+ }
+ else
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+ }
+
+ void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot)
+ {
+ if (type == RTC_BUFFER_TYPE_INDEX)
+ {
+ if (slot != 0)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ return triangles.getPtr();
+ }
+ else if (type == RTC_BUFFER_TYPE_VERTEX)
+ {
+ if (slot >= vertices.size())
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ return vertices[slot].getPtr();
+ }
+ else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+ {
+ if (slot >= vertexAttribs.size())
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ return vertexAttribs[slot].getPtr();
+ }
+ else
+ {
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+ return nullptr;
+ }
+ }
+
+ void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot)
+ {
+ if (type == RTC_BUFFER_TYPE_INDEX)
+ {
+ if (slot != 0)
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ triangles.setModified();
+ }
+ else if (type == RTC_BUFFER_TYPE_VERTEX)
+ {
+ if (slot >= vertices.size())
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ vertices[slot].setModified();
+ }
+ else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE)
+ {
+ if (slot >= vertexAttribs.size())
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+ vertexAttribs[slot].setModified();
+ }
+ else
+ {
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+ }
+
+ Geometry::update();
+ }
+
+ void TriangleMesh::commit()
+ {
+    /* verify that the strides of all time steps are identical */
+ for (unsigned int t=0; t<numTimeSteps; t++)
+ if (vertices[t].getStride() != vertices[0].getStride())
+ throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step");
+
+ Geometry::commit();
+ }
+
+ void TriangleMesh::addElementsToCount (GeometryCounts & counts) const
+ {
+ if (numTimeSteps == 1) counts.numTriangles += numPrimitives;
+ else counts.numMBTriangles += numPrimitives;
+ }
+
+ bool TriangleMesh::verify()
+ {
+ /*! verify size of vertex arrays */
+ if (vertices.size() == 0) return false;
+ for (const auto& buffer : vertices)
+ if (buffer.size() != numVertices())
+ return false;
+
+    /*! verify size of vertex attribute arrays */
+ for (const auto& buffer : vertexAttribs)
+ if (buffer.size() != numVertices())
+ return false;
+
+ /*! verify triangle indices */
+ for (size_t i=0; i<size(); i++) {
+ if (triangles[i].v[0] >= numVertices()) return false;
+ if (triangles[i].v[1] >= numVertices()) return false;
+ if (triangles[i].v[2] >= numVertices()) return false;
+ }
+
+ /*! verify vertices */
+ for (const auto& buffer : vertices)
+ for (size_t i=0; i<buffer.size(); i++)
+ if (!isvalid(buffer[i]))
+ return false;
+
+ return true;
+ }
+
+ void TriangleMesh::interpolate(const RTCInterpolateArguments* const args)
+ {
+ unsigned int primID = args->primID;
+ float u = args->u;
+ float v = args->v;
+ RTCBufferType bufferType = args->bufferType;
+ unsigned int bufferSlot = args->bufferSlot;
+ float* P = args->P;
+ float* dPdu = args->dPdu;
+ float* dPdv = args->dPdv;
+ float* ddPdudu = args->ddPdudu;
+ float* ddPdvdv = args->ddPdvdv;
+ float* ddPdudv = args->ddPdudv;
+ unsigned int valueCount = args->valueCount;
+
+ /* calculate base pointer and stride */
+ assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+           (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot < vertexAttribs.size()));
+ const char* src = nullptr;
+ size_t stride = 0;
+ if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+ src = vertexAttribs[bufferSlot].getPtr();
+ stride = vertexAttribs[bufferSlot].getStride();
+ } else {
+ src = vertices[bufferSlot].getPtr();
+ stride = vertices[bufferSlot].getStride();
+ }
+
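+    /* the loop below interpolates the values in packets of 4 floats; 'valid'
+     * masks off the tail lanes when valueCount is not a multiple of 4, and P
+     * is blended barycentrically as P = (1-u-v)*p0 + u*p1 + v*p2 */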
+ for (unsigned int i=0; i<valueCount; i+=4)
+ {
+ size_t ofs = i*sizeof(float);
+ const float w = 1.0f-u-v;
+ const Triangle& tri = triangle(primID);
+ const vbool4 valid = vint4((int)i)+vint4(step) < vint4(int(valueCount));
+ const vfloat4 p0 = vfloat4::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]);
+ const vfloat4 p1 = vfloat4::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]);
+ const vfloat4 p2 = vfloat4::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]);
+
+ if (P) {
+ vfloat4::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2)));
+ }
+ if (dPdu) {
+ assert(dPdu); vfloat4::storeu(valid,dPdu+i,p1-p0);
+ assert(dPdv); vfloat4::storeu(valid,dPdv+i,p2-p0);
+ }
+ if (ddPdudu) {
+ assert(ddPdudu); vfloat4::storeu(valid,ddPdudu+i,vfloat4(zero));
+ assert(ddPdvdv); vfloat4::storeu(valid,ddPdvdv+i,vfloat4(zero));
+ assert(ddPdudv); vfloat4::storeu(valid,ddPdudv+i,vfloat4(zero));
+ }
+ }
+ }
+
+#endif
+
+ namespace isa
+ {
+ TriangleMesh* createTriangleMesh(Device* device) {
+ return new TriangleMeshISA(device);
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
new file mode 100644
index 0000000000..eaf2e1799a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
@@ -0,0 +1,264 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+ /*! Triangle Mesh */
+ struct TriangleMesh : public Geometry
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH;
+
+ /*! triangle indices */
+ struct Triangle
+ {
+ uint32_t v[3];
+
+ /*! outputs triangle indices */
+ __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) {
+ return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }";
+ }
+ };
+
+ public:
+
+ /*! triangle mesh construction */
+ TriangleMesh (Device* device);
+
+ /* geometry interface */
+ public:
+ void setMask(unsigned mask);
+ void setNumTimeSteps (unsigned int numTimeSteps);
+ void setVertexAttributeCount (unsigned int N);
+ void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+ void* getBuffer(RTCBufferType type, unsigned int slot);
+ void updateBuffer(RTCBufferType type, unsigned int slot);
+ void commit();
+ bool verify();
+ void interpolate(const RTCInterpolateArguments* const args);
+ void addElementsToCount (GeometryCounts & counts) const;
+
+ public:
+
+ /*! returns number of vertices */
+ __forceinline size_t numVertices() const {
+ return vertices[0].size();
+ }
+
+    /*! returns the i'th triangle */
+ __forceinline const Triangle& triangle(size_t i) const {
+ return triangles[i];
+ }
+
+ /*! returns i'th vertex of the first time step */
+ __forceinline const Vec3fa vertex(size_t i) const {
+ return vertices0[i];
+ }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+ __forceinline const char* vertexPtr(size_t i) const {
+ return vertices0.getPtr(i);
+ }
+
+ /*! returns i'th vertex of itime'th timestep */
+ __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+ return vertices[itime][i];
+ }
+
+    /*! returns a pointer to the i'th vertex of the itime'th timestep */
+ __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+ return vertices[itime].getPtr(i);
+ }
+
+ /*! calculates the bounds of the i'th triangle */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ const Triangle& tri = triangle(i);
+ const Vec3fa v0 = vertex(tri.v[0]);
+ const Vec3fa v1 = vertex(tri.v[1]);
+ const Vec3fa v2 = vertex(tri.v[2]);
+ return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+ }
+
+ /*! calculates the bounds of the i'th triangle at the itime'th timestep */
+ __forceinline BBox3fa bounds(size_t i, size_t itime) const
+ {
+ const Triangle& tri = triangle(i);
+ const Vec3fa v0 = vertex(tri.v[0],itime);
+ const Vec3fa v1 = vertex(tri.v[1],itime);
+ const Vec3fa v2 = vertex(tri.v[2],itime);
+ return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+ }
+
+ /*! check if the i'th primitive is valid at the itime'th timestep */
+ __forceinline bool valid(size_t i, size_t itime) const {
+ return valid(i, make_range(itime, itime));
+ }
+
+ /*! check if the i'th primitive is valid between the specified time range */
+ __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+ {
+ const Triangle& tri = triangle(i);
+ if (unlikely(tri.v[0] >= numVertices())) return false;
+ if (unlikely(tri.v[1] >= numVertices())) return false;
+ if (unlikely(tri.v[2] >= numVertices())) return false;
+
+ for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+ {
+ if (!isvalid(vertex(tri.v[0],itime))) return false;
+ if (!isvalid(vertex(tri.v[1],itime))) return false;
+ if (!isvalid(vertex(tri.v[2],itime))) return false;
+ }
+
+ return true;
+ }
+
+    /*! calculates the linear bounds of the i'th primitive at the itime'th time segment */
+ __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+ return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+ }
+
+ /*! calculates the build bounds of the i'th primitive, if it's valid */
+ __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+ {
+ const Triangle& tri = triangle(i);
+ if (unlikely(tri.v[0] >= numVertices())) return false;
+ if (unlikely(tri.v[1] >= numVertices())) return false;
+ if (unlikely(tri.v[2] >= numVertices())) return false;
+
+ for (size_t t=0; t<numTimeSteps; t++)
+ {
+ const Vec3fa v0 = vertex(tri.v[0],t);
+ const Vec3fa v1 = vertex(tri.v[1],t);
+ const Vec3fa v2 = vertex(tri.v[2],t);
+ if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2)))
+ return false;
+ }
+
+ if (likely(bbox))
+ *bbox = bounds(i);
+
+ return true;
+ }
+
+ /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+ __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+ {
+ const Triangle& tri = triangle(i);
+ if (unlikely(tri.v[0] >= numVertices())) return false;
+ if (unlikely(tri.v[1] >= numVertices())) return false;
+ if (unlikely(tri.v[2] >= numVertices())) return false;
+
+ assert(itime+1 < numTimeSteps);
+ const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+ const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+ const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+ const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+ const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+ const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+
+ /* use bounds of first time step in builder */
+ bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2));
+ return true;
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+ return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+ }
+
+ /*! calculates the linear bounds of the i'th primitive for the specified time range */
+ __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const {
+ if (!valid(i, timeSegmentRange(dt))) return false;
+ bbox = linearBounds(i, dt);
+ return true;
+ }
+
+ /*! get fast access to first vertex buffer */
+ __forceinline float * getCompactVertexArray () const {
+ return (float*) vertices0.getPtr();
+ }
+
+ /* gets version info of topology */
+ unsigned int getTopologyVersion() const {
+ return triangles.modCounter;
+ }
+
+ /* returns true if topology changed */
+ bool topologyChanged(unsigned int otherVersion) const {
+ return triangles.isModified(otherVersion); // || numPrimitivesChanged;
+ }
+
+ /* returns the projected area */
+ __forceinline float projectedPrimitiveArea(const size_t i) const {
+ const Triangle& tri = triangle(i);
+ const Vec3fa v0 = vertex(tri.v[0]);
+ const Vec3fa v1 = vertex(tri.v[1]);
+ const Vec3fa v2 = vertex(tri.v[2]);
+ return areaProjectedTriangle(v0,v1,v2);
+ }
+
+ public:
+ BufferView<Triangle> triangles; //!< array of triangles
+ BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer
+ vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+ vector<RawBufferView> vertexAttribs; //!< vertex attributes
+ };
+
+ namespace isa
+ {
+ struct TriangleMeshISA : public TriangleMesh
+ {
+ TriangleMeshISA (Device* device)
+ : TriangleMesh(device) {}
+
+ PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,&bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,itime,bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfoMB pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!valid(j, timeSegmentRange(t0t1))) continue;
+ const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+ pinfo.add_primref(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
new file mode 100644
index 0000000000..8d11ed6986
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
@@ -0,0 +1,77 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accelset.h"
+
+namespace embree
+{
+ /*! User geometry with user defined intersection functions */
+ struct UserGeometry : public AccelSet
+ {
+ /*! type of this geometry */
+ static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY;
+
+ public:
+ UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1);
+ virtual void setMask (unsigned mask);
+ virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr);
+ virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect);
+ virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded);
+ virtual void build() {}
+ virtual void addElementsToCount (GeometryCounts & counts) const;
+ };
+
+ namespace isa
+ {
+ struct UserGeometryISA : public UserGeometry
+ {
+ UserGeometryISA (Device* device)
+ : UserGeometry(device) {}
+
+ PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,&bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!buildBounds(j,itime,bounds)) continue;
+ const PrimRef prim(bounds,geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+ {
+ PrimInfoMB pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!valid(j, timeSegmentRange(t0t1))) continue;
+ const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j));
+ pinfo.add_primref(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }
+ };
+ }
+
+ DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h
new file mode 100644
index 0000000000..533c385365
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+ /*! An item on the stack holds the node ID and distance of that node. */
+ template<typename T>
+ struct __aligned(16) StackItemT
+ {
+ /*! assert that the xchg function works */
+ static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");
+
+ __forceinline StackItemT() {}
+
+ __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {}
+
+ /*! use SSE instructions to swap stack items */
+ __forceinline static void xchg(StackItemT& a, StackItemT& b)
+ {
+ const vfloat4 sse_a = vfloat4::load((float*)&a);
+ const vfloat4 sse_b = vfloat4::load((float*)&b);
+ vfloat4::store(&a,sse_b);
+ vfloat4::store(&b,sse_a);
+ }
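+    /* note: the full-register load/store above is safe because StackItemT is
+     * 16-byte aligned and at most 16 bytes large (sizeof(T) <= 12 plus the
+     * 4-byte dist field) */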
+
+ /*! Sort 2 stack items. */
+ __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ }
+
+ /*! Sort 3 stack items. */
+ __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
+ {
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ if (s3.dist < s2.dist) xchg(s3,s2);
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ }
+
+ /*! Sort 4 stack items. */
+ __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
+ {
+ if (s2.dist < s1.dist) xchg(s2,s1);
+ if (s4.dist < s3.dist) xchg(s4,s3);
+ if (s3.dist < s1.dist) xchg(s3,s1);
+ if (s4.dist < s2.dist) xchg(s4,s2);
+ if (s3.dist < s2.dist) xchg(s3,s2);
+ }
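+    /* the five compare-exchanges above form a 5-comparator sorting network
+     * for 4 elements: (1,2),(3,4),(1,3),(2,4),(2,3) */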
+
+    /*! compare-exchange of two packed stack items: orders them by the distance stored in lane 2, leaving the closer item in a */
+ __forceinline static void cmp_xchg(vint4& a, vint4& b)
+ {
+#if defined(__AVX512VL__)
+ const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a));
+#else
+ const vboolf4 mask0(b < a);
+ const vboolf4 mask(shuffle<2,2,2,2>(mask0));
+#endif
+ const vint4 c = select(mask,b,a);
+ const vint4 d = select(mask,a,b);
+ a = c;
+ b = d;
+ }
+
+ /*! Sort 3 stack items. */
+ __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
+ {
+ cmp_xchg(s2,s1);
+ cmp_xchg(s3,s2);
+ cmp_xchg(s2,s1);
+ }
+
+ /*! Sort 4 stack items. */
+ __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
+ {
+ cmp_xchg(s2,s1);
+ cmp_xchg(s4,s3);
+ cmp_xchg(s3,s1);
+ cmp_xchg(s4,s2);
+ cmp_xchg(s3,s2);
+ }
+
+
+    /*! Sort N stack items by insertion sort, in descending distance order so that the nearest item ends up on top of the stack. */
+ __forceinline friend void sort(StackItemT* begin, StackItemT* end)
+ {
+ for (StackItemT* i = begin+1; i != end; ++i)
+ {
+ const vfloat4 item = vfloat4::load((float*)i);
+ const unsigned dist = i->dist;
+ StackItemT* j = i;
+
+ while ((j != begin) && ((j-1)->dist < dist))
+ {
+ vfloat4::store(j, vfloat4::load((float*)(j-1)));
+ --j;
+ }
+
+ vfloat4::store(j, item);
+ }
+ }
+
+ public:
+ T ptr;
+ unsigned dist;
+ };
+
+ /*! An item on the stack holds the node ID and active ray mask. */
+ template<typename T>
+ struct __aligned(8) StackItemMaskT
+ {
+ T ptr;
+ size_t mask;
+ };
+
+ struct __aligned(8) StackItemMaskCoherent
+ {
+ size_t mask;
+ size_t parent;
+ size_t child;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp
new file mode 100644
index 0000000000..b73c3a8c76
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.cpp
@@ -0,0 +1,128 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stat.h"
+
+namespace embree
+{
+ Stat Stat::instance;
+
+ Stat::Stat () {
+ }
+
+ Stat::~Stat ()
+ {
+#ifdef EMBREE_STAT_COUNTERS
+ Stat::print(std::cout);
+#endif
+ }
+
+ void Stat::print(std::ostream& cout)
+ {
+ Counters& cntrs = instance.cntrs;
+ Counters::Data& data = instance.cntrs.code;
+ //Counters::Data& data = instance.cntrs.active;
+
+ /* print absolute numbers */
+ cout << "--------- ABSOLUTE ---------" << std::endl;
+ cout << " #normal_travs = " << float(data.normal.travs )*1E-6 << "M" << std::endl;
+ cout << " #nodes = " << float(data.normal.trav_nodes )*1E-6 << "M" << std::endl;
+ cout << " #nodes_xfm = " << float(data.normal.trav_xfm_nodes )*1E-6 << "M" << std::endl;
+ cout << " #leaves = " << float(data.normal.trav_leaves )*1E-6 << "M" << std::endl;
+ cout << " #prims = " << float(data.normal.trav_prims )*1E-6 << "M" << std::endl;
+ cout << " #prim_hits = " << float(data.normal.trav_prim_hits )*1E-6 << "M" << std::endl;
+
+ cout << " #stack nodes = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl;
+ cout << " #stack pop = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+ size_t normal_box_hits = 0;
+ size_t weighted_box_hits = 0;
+ for (size_t i=0;i<SIZE_HISTOGRAM;i++) {
+ normal_box_hits += data.normal.trav_hit_boxes[i];
+ weighted_box_hits += data.normal.trav_hit_boxes[i]*i;
+ }
+ cout << " #hit_boxes = " << normal_box_hits << " (total) distribution: ";
+ float average = 0.0f;
+ for (size_t i=0;i<SIZE_HISTOGRAM;i++)
+ {
+ float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits;
+ cout << "[" << i << "] " << value << " ";
+ average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits;
+ }
+ cout << " average = " << average << std::endl;
+ for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " ";
+ cout << std::endl;
+
+ if (data.shadow.travs) {
+ cout << " #shadow_travs = " << float(data.shadow.travs )*1E-6 << "M" << std::endl;
+ cout << " #nodes = " << float(data.shadow.trav_nodes )*1E-6 << "M" << std::endl;
+ cout << " #nodes_xfm = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl;
+ cout << " #leaves = " << float(data.shadow.trav_leaves )*1E-6 << "M" << std::endl;
+ cout << " #prims = " << float(data.shadow.trav_prims )*1E-6 << "M" << std::endl;
+ cout << " #prim_hits = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl;
+
+ cout << " #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl;
+ cout << " #stack pop = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+ size_t shadow_box_hits = 0;
+ size_t weighted_shadow_box_hits = 0;
+
+ for (size_t i=0;i<SIZE_HISTOGRAM;i++) {
+ shadow_box_hits += data.shadow.trav_hit_boxes[i];
+ weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i;
+ }
+ cout << " #hit_boxes = ";
+ for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " ";
+ cout << std::endl;
+ for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " ";
+ cout << std::endl;
+ }
+ cout << std::endl;
+
+ /* print per traversal numbers */
+ cout << "--------- PER TRAVERSAL ---------" << std::endl;
+ float active_normal_travs = float(cntrs.active.normal.travs )/float(cntrs.all.normal.travs );
+ float active_normal_trav_nodes = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes );
+ float active_normal_trav_xfm_nodes = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes );
+ float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves);
+ float active_normal_trav_prims = float(cntrs.active.normal.trav_prims )/float(cntrs.all.normal.trav_prims );
+ float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits )/float(cntrs.all.normal.trav_prim_hits );
+ float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop )/float(cntrs.all.normal.trav_stack_pop );
+
+ cout << " #normal_travs = " << float(cntrs.code.normal.travs )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs << "% active" << std::endl;
+ cout << " #nodes = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes << "% active" << std::endl;
+ cout << " #node_xfm = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes << "% active" << std::endl;
+ cout << " #leaves = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl;
+ cout << " #prims = " << float(cntrs.code.normal.trav_prims )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims << "% active" << std::endl;
+ cout << " #prim_hits = " << float(cntrs.code.normal.trav_prim_hits )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits << "% active" << std::endl;
+ cout << " #stack_pop = " << float(cntrs.code.normal.trav_stack_pop )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop << "% active" << std::endl;
+
+ if (cntrs.all.shadow.travs) {
+ float active_shadow_travs = float(cntrs.active.shadow.travs )/float(cntrs.all.shadow.travs );
+ float active_shadow_trav_nodes = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes );
+ float active_shadow_trav_xfm_nodes = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes );
+ float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves);
+ float active_shadow_trav_prims = float(cntrs.active.shadow.trav_prims )/float(cntrs.all.shadow.trav_prims );
+ float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits )/float(cntrs.all.shadow.trav_prim_hits );
+
+ cout << " #shadow_travs = " << float(cntrs.code.shadow.travs )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs << "% active" << std::endl;
+ cout << " #nodes = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes << "% active" << std::endl;
+ cout << " #nodes_xfm = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes << "% active" << std::endl;
+ cout << " #leaves = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl;
+ cout << " #prims = " << float(cntrs.code.shadow.trav_prims )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims << "% active" << std::endl;
+ cout << " #prim_hits = " << float(cntrs.code.shadow.trav_prim_hits )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits << "% active" << std::endl;
+
+ }
+ cout << std::endl;
+
+ /* print user counters for performance tuning */
+ cout << "--------- USER ---------" << std::endl;
+ for (size_t i=0; i<10; i++)
+ cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl;
+
+ cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl;
+ cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl;
+ cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl;
+ cout << std::endl;
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.h b/thirdparty/embree-aarch64/kernels/common/stat.h
new file mode 100644
index 0000000000..3cda2bd014
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.h
@@ -0,0 +1,116 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+/* Macros to gather statistics */
+#ifdef EMBREE_STAT_COUNTERS
+# define STAT(x) x
+# define STAT3(s,x,y,z) \
+ STAT(Stat::get().code .s+=x); \
+ STAT(Stat::get().active.s+=y); \
+ STAT(Stat::get().all .s+=z);
+# define STAT_USER(i,x) Stat::get().user[i]+=x;
+#else
+# define STAT(x)
+# define STAT3(s,x,y,z)
+# define STAT_USER(i,x)
+#endif
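+
+/* illustrative (hypothetical) call site for the macros above: a traversal
+ * step of a 4-wide ray packet with 3 active lanes could be counted as
+ *   STAT3(normal.trav_nodes,1,3,4);
+ * i.e. the code location was hit once, 3 lanes were active, and 4 lanes
+ * reached it in total */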
+
+namespace embree
+{
+ /*! Gathers ray tracing statistics. We count 1) how often a code
+ * location is reached, 2) how many SIMD lanes are active, 3) how
+ * many SIMD lanes reach the code location */
+ class Stat
+ {
+ public:
+
+ static const size_t SIZE_HISTOGRAM = 64+1;
+
+ /*! constructs stat counter class */
+ Stat ();
+
+ /*! destructs stat counter class */
+ ~Stat ();
+
+ class Counters
+ {
+ public:
+ Counters () {
+ clear();
+ }
+
+ void clear()
+ {
+ all.clear();
+ active.clear();
+ code.clear();
+ for (auto& u : user) u.store(0);
+ }
+
+ public:
+
+      /* per packet and per ray statistics */
+ struct Data
+ {
+ void clear () {
+ normal.clear();
+ shadow.clear();
+ point_query.clear();
+ }
+
+ /* normal and shadow ray statistics */
+ struct
+ {
+ void clear()
+ {
+ travs.store(0);
+ trav_nodes.store(0);
+ trav_leaves.store(0);
+ trav_prims.store(0);
+ trav_prim_hits.store(0);
+ for (auto& v : trav_hit_boxes) v.store(0);
+ trav_stack_pop.store(0);
+ trav_stack_nodes.store(0);
+ trav_xfm_nodes.store(0);
+ }
+
+ public:
+ std::atomic<size_t> travs;
+ std::atomic<size_t> trav_nodes;
+ std::atomic<size_t> trav_leaves;
+ std::atomic<size_t> trav_prims;
+ std::atomic<size_t> trav_prim_hits;
+ std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1];
+ std::atomic<size_t> trav_stack_pop;
+ std::atomic<size_t> trav_stack_nodes;
+ std::atomic<size_t> trav_xfm_nodes;
+
+ } normal, shadow, point_query;
+ } all, active, code;
+
+ std::atomic<size_t> user[10];
+ };
+
+ public:
+
+ static __forceinline Counters& get() {
+ return instance.cntrs;
+ }
+
+ static void clear() {
+ instance.cntrs.clear();
+ }
+
+ static void print(embree_ostream cout);
+
+ private:
+ Counters cntrs;
+
+ private:
+ static Stat instance;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp
new file mode 100644
index 0000000000..51fc9b7826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.cpp
@@ -0,0 +1,543 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "state.h"
+#include "../../common/lexers/streamfilters.h"
+
+namespace embree
+{
+ MutexSys g_printMutex;
+
+ State::ErrorHandler State::g_errorHandler;
+
+ State::ErrorHandler::ErrorHandler()
+ : thread_error(createTls()) {}
+
+ State::ErrorHandler::~ErrorHandler()
+ {
+ Lock<MutexSys> lock(errors_mutex);
+ for (size_t i=0; i<thread_errors.size(); i++)
+ delete thread_errors[i];
+ destroyTls(thread_error);
+ thread_errors.clear();
+ }
+
+ RTCError* State::ErrorHandler::error()
+ {
+ RTCError* stored_error = (RTCError*) getTls(thread_error);
+ if (stored_error) return stored_error;
+
+ Lock<MutexSys> lock(errors_mutex);
+ stored_error = new RTCError(RTC_ERROR_NONE);
+ thread_errors.push_back(stored_error);
+ setTls(thread_error,stored_error);
+ return stored_error;
+ }
+
+ State::State ()
+ : enabled_cpu_features(getCPUFeatures()),
+ enabled_builder_cpu_features(enabled_cpu_features),
+ frequency_level(FREQUENCY_SIMD256)
+ {
+ tri_accel = "default";
+ tri_builder = "default";
+ tri_traverser = "default";
+
+ tri_accel_mb = "default";
+ tri_builder_mb = "default";
+ tri_traverser_mb = "default";
+
+ quad_accel = "default";
+ quad_builder = "default";
+ quad_traverser = "default";
+
+ quad_accel_mb = "default";
+ quad_builder_mb = "default";
+ quad_traverser_mb = "default";
+
+ line_accel = "default";
+ line_builder = "default";
+ line_traverser = "default";
+
+ line_accel_mb = "default";
+ line_builder_mb = "default";
+ line_traverser_mb = "default";
+
+ hair_accel = "default";
+ hair_builder = "default";
+ hair_traverser = "default";
+
+ hair_accel_mb = "default";
+ hair_builder_mb = "default";
+ hair_traverser_mb = "default";
+
+ object_accel = "default";
+ object_builder = "default";
+ object_accel_min_leaf_size = 1;
+ object_accel_max_leaf_size = 1;
+
+ object_accel_mb = "default";
+ object_builder_mb = "default";
+ object_accel_mb_min_leaf_size = 1;
+ object_accel_mb_max_leaf_size = 1;
+
+ max_spatial_split_replications = 1.2f;
+ useSpatialPreSplits = false;
+
+ tessellation_cache_size = 128*1024*1024;
+
+ subdiv_accel = "default";
+ subdiv_accel_mb = "default";
+
+ grid_accel = "default";
+ grid_builder = "default";
+ grid_accel_mb = "default";
+ grid_builder_mb = "default";
+
+ instancing_open_min = 0;
+ instancing_block_size = 0;
+ instancing_open_factor = 8.0f;
+ instancing_open_max_depth = 32;
+ instancing_open_max = 50000000;
+
+ ignore_config_files = false;
+ float_exceptions = false;
+ quality_flags = -1;
+ scene_flags = -1;
+ verbose = 0;
+ benchmark = 0;
+
+ numThreads = 0;
+ numUserThreads = 0;
+
+#if TASKING_INTERNAL
+ set_affinity = true;
+#else
+ set_affinity = false;
+#endif
+    /* enable affinity by default on KNL */
+ if (hasISA(AVX512KNL)) set_affinity = true;
+
+ start_threads = false;
+ enable_selockmemoryprivilege = false;
+#if defined(__LINUX__)
+ hugepages = true;
+#else
+ hugepages = false;
+#endif
+ hugepages_success = true;
+
+ alloc_main_block_size = 0;
+ alloc_num_main_slots = 0;
+ alloc_thread_block_size = 0;
+ alloc_single_thread_alloc = -1;
+
+ error_function = nullptr;
+ error_function_userptr = nullptr;
+
+ memory_monitor_function = nullptr;
+ memory_monitor_userptr = nullptr;
+ }
+
+ State::~State() {
+ }
+
+ bool State::hasISA(const int isa) {
+ return (enabled_cpu_features & isa) == isa;
+ }
+
+ bool State::checkISASupport() {
+#if defined(__ARM_NEON)
+ /*
+ * NEON CPU type is a mixture of NEON and SSE2
+ */
+
+ bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2;
+
+    /* this is true when the Device was explicitly initialized with the `isa=neon` config */
+ bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON;
+
+ return hasSSE2 || hasNEON;
+#else
+ return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
+#endif
+ }
+
+ void State::verify()
+ {
+ /* verify that calculations stay in range */
+ assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX);
+
+ /* here we verify that CPP files compiled for a specific ISA only
+ * call that same or lower ISA version of non-inlined class member
+ * functions */
+#if defined(DEBUG)
+#if defined(EMBREE_TARGET_SSE2)
+#if !defined(__ARM_NEON)
+ assert(sse2::getISA() <= SSE2);
+#endif
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+ assert(sse42::getISA() <= SSE42);
+#endif
+#if defined(EMBREE_TARGET_AVX)
+ assert(avx::getISA() <= AVX);
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+ assert(avx2::getISA() <= AVX2);
+#endif
+#if defined (EMBREE_TARGET_AVX512KNL)
+ assert(avx512knl::getISA() <= AVX512KNL);
+#endif
+#if defined (EMBREE_TARGET_AVX512SKX)
+ assert(avx512skx::getISA() <= AVX512SKX);
+#endif
+#endif
+ }
+
+ const char* symbols[3] = { "=", ",", "|" };
+
+ bool State::parseFile(const FileName& fileName)
+ {
+ FILE* f = fopen(fileName.c_str(),"r");
+ if (!f) return false;
+ Ref<Stream<int> > file = new FileStream(f,fileName);
+
+ std::vector<std::string> syms;
+ for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++)
+ syms.push_back(symbols[i]);
+
+ Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(file,"#"),
+ TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+ TokenStream::separators,syms);
+ parse(cin);
+ return true;
+ }
+
+ void State::parseString(const char* cfg)
+ {
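+    /* for illustration: cfg is a config string of the form
+     * "threads=8,isa=avx2,verbose=1" (e.g. as passed to rtcNewDevice); it is
+     * tokenized into key=value entries separated by ',' using the symbols
+     * table above */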
+ if (cfg == nullptr) return;
+
+ std::vector<std::string> syms;
+ for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++)
+ syms.push_back(symbols[i]);
+
+ Ref<TokenStream> cin = new TokenStream(new StrStream(cfg),
+ TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+ TokenStream::separators,syms);
+ parse(cin);
+ }
+
+ int string_to_cpufeatures(const std::string& isa)
+ {
+ if (isa == "sse" ) return SSE;
+ else if (isa == "sse2") return SSE2;
+ else if (isa == "sse3") return SSE3;
+ else if (isa == "ssse3") return SSSE3;
+ else if (isa == "sse41") return SSE41;
+ else if (isa == "sse4.1") return SSE41;
+ else if (isa == "sse42") return SSE42;
+ else if (isa == "sse4.2") return SSE42;
+ else if (isa == "avx") return AVX;
+ else if (isa == "avxi") return AVXI;
+ else if (isa == "avx2") return AVX2;
+ else if (isa == "avx512knl") return AVX512KNL;
+ else if (isa == "avx512skx") return AVX512SKX;
+ else return SSE2;
+ }
+
+ void State::parse(Ref<TokenStream> cin)
+ {
+ /* parse until end of stream */
+ while (cin->peek() != Token::Eof())
+ {
+ const Token tok = cin->get();
+
+ if (tok == Token::Id("threads") && cin->trySymbol("="))
+ numThreads = cin->get().Int();
+
+ else if (tok == Token::Id("user_threads")&& cin->trySymbol("="))
+ numUserThreads = cin->get().Int();
+
+ else if (tok == Token::Id("set_affinity")&& cin->trySymbol("="))
+ set_affinity = cin->get().Int();
+
+ else if (tok == Token::Id("affinity")&& cin->trySymbol("="))
+ set_affinity = cin->get().Int();
+
+ else if (tok == Token::Id("start_threads")&& cin->trySymbol("="))
+ start_threads = cin->get().Int();
+
+ else if (tok == Token::Id("isa") && cin->trySymbol("=")) {
+ std::string isa = toLowerCase(cin->get().Identifier());
+ enabled_cpu_features = string_to_cpufeatures(isa);
+ enabled_builder_cpu_features = enabled_cpu_features;
+ }
+
+ else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) {
+ std::string isa = toLowerCase(cin->get().Identifier());
+ enabled_cpu_features &= string_to_cpufeatures(isa);
+ enabled_builder_cpu_features &= enabled_cpu_features;
+ }
+
+ else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) {
+ std::string isa = toLowerCase(cin->get().Identifier());
+ enabled_builder_cpu_features &= string_to_cpufeatures(isa);
+ }
+
+ else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) {
+ std::string freq = cin->get().Identifier();
+ if (freq == "simd128") frequency_level = FREQUENCY_SIMD128;
+ else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256;
+ else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512;
+ }
+
+ else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) {
+ enable_selockmemoryprivilege = cin->get().Int();
+ }
+ else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) {
+ hugepages = cin->get().Int();
+ }
+
+ else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("="))
+ ignore_config_files = cin->get().Int();
+ else if (tok == Token::Id("float_exceptions") && cin->trySymbol("="))
+ float_exceptions = cin->get().Int();
+
+ else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("="))
+ tri_accel = cin->get().Identifier();
+ else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("="))
+ tri_builder = cin->get().Identifier();
+ else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("="))
+ tri_traverser = cin->get().Identifier();
+
+ else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("="))
+ tri_accel_mb = cin->get().Identifier();
+ else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("="))
+ tri_builder_mb = cin->get().Identifier();
+ else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("="))
+ tri_traverser_mb = cin->get().Identifier();
+
+ else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("="))
+ quad_accel = cin->get().Identifier();
+ else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("="))
+ quad_builder = cin->get().Identifier();
+ else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("="))
+ quad_traverser = cin->get().Identifier();
+
+ else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("="))
+ quad_accel_mb = cin->get().Identifier();
+ else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("="))
+ quad_builder_mb = cin->get().Identifier();
+ else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("="))
+ quad_traverser_mb = cin->get().Identifier();
+
+ else if ((tok == Token::Id("line_accel")) && cin->trySymbol("="))
+ line_accel = cin->get().Identifier();
+ else if ((tok == Token::Id("line_builder")) && cin->trySymbol("="))
+ line_builder = cin->get().Identifier();
+ else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("="))
+ line_traverser = cin->get().Identifier();
+
+ else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("="))
+ line_accel_mb = cin->get().Identifier();
+ else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("="))
+ line_builder_mb = cin->get().Identifier();
+ else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("="))
+ line_traverser_mb = cin->get().Identifier();
+
+ else if (tok == Token::Id("hair_accel") && cin->trySymbol("="))
+ hair_accel = cin->get().Identifier();
+ else if (tok == Token::Id("hair_builder") && cin->trySymbol("="))
+ hair_builder = cin->get().Identifier();
+ else if (tok == Token::Id("hair_traverser") && cin->trySymbol("="))
+ hair_traverser = cin->get().Identifier();
+
+ else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("="))
+ hair_accel_mb = cin->get().Identifier();
+ else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("="))
+ hair_builder_mb = cin->get().Identifier();
+ else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("="))
+ hair_traverser_mb = cin->get().Identifier();
+
+ else if (tok == Token::Id("object_accel") && cin->trySymbol("="))
+ object_accel = cin->get().Identifier();
+ else if (tok == Token::Id("object_builder") && cin->trySymbol("="))
+ object_builder = cin->get().Identifier();
+ else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("="))
+ object_accel_min_leaf_size = cin->get().Int();
+ else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("="))
+ object_accel_max_leaf_size = cin->get().Int();
+
+ else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("="))
+ object_accel_mb = cin->get().Identifier();
+ else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("="))
+ object_builder_mb = cin->get().Identifier();
+ else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("="))
+ object_accel_mb_min_leaf_size = cin->get().Int();
+ else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("="))
+ object_accel_mb_max_leaf_size = cin->get().Int();
+
+ else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("="))
+ instancing_open_min = cin->get().Int();
+ else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) {
+ instancing_block_size = cin->get().Int();
+ instancing_open_factor = 0.0f;
+ }
+ else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("="))
+ instancing_open_max_depth = cin->get().Int();
+ else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) {
+ instancing_block_size = 0;
+ instancing_open_factor = cin->get().Float();
+ }
+ else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("="))
+ instancing_open_max = cin->get().Int();
+
+ else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("="))
+ subdiv_accel = cin->get().Identifier();
+ else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("="))
+ subdiv_accel_mb = cin->get().Identifier();
+
+ else if (tok == Token::Id("grid_accel") && cin->trySymbol("="))
+ grid_accel = cin->get().Identifier();
+ else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("="))
+ grid_accel_mb = cin->get().Identifier();
+
+ else if (tok == Token::Id("verbose") && cin->trySymbol("="))
+ verbose = cin->get().Int();
+ else if (tok == Token::Id("benchmark") && cin->trySymbol("="))
+ benchmark = cin->get().Int();
+
+ else if (tok == Token::Id("quality")) {
+ if (cin->trySymbol("=")) {
+ Token flag = cin->get();
+ if (flag == Token::Id("low")) quality_flags = RTC_BUILD_QUALITY_LOW;
+ else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM;
+ else if (flag == Token::Id("high")) quality_flags = RTC_BUILD_QUALITY_HIGH;
+ }
+ }
+
+ else if (tok == Token::Id("scene_flags")) {
+ scene_flags = 0;
+ if (cin->trySymbol("=")) {
+ do {
+ Token flag = cin->get();
+ if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC;
+ else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT;
+ else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST;
+ } while (cin->trySymbol("|"));
+ }
+ }
+
+ else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("="))
+ max_spatial_split_replications = cin->get().Float();
+
+ else if (tok == Token::Id("presplits") && cin->trySymbol("="))
+        useSpatialPreSplits = cin->get().Int() != 0;
+
+ else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("="))
+ tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+ else if (tok == Token::Id("cache_size") && cin->trySymbol("="))
+ tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+
+ else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("="))
+ alloc_main_block_size = cin->get().Int();
+ else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("="))
+ alloc_num_main_slots = cin->get().Int();
+ else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("="))
+ alloc_thread_block_size = cin->get().Int();
+ else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("="))
+ alloc_single_thread_alloc = cin->get().Int();
+
+ cin->trySymbol(","); // optional , separator
+ }
+ }
+
+ bool State::verbosity(size_t N) {
+ return N <= verbose;
+ }
+
+ void State::print()
+ {
+ std::cout << "general:" << std::endl;
+ std::cout << " build threads = " << numThreads << std::endl;
+ std::cout << " build user threads = " << numUserThreads << std::endl;
+ std::cout << " start_threads = " << start_threads << std::endl;
+ std::cout << " affinity = " << set_affinity << std::endl;
+ std::cout << " frequency_level = ";
+ switch (frequency_level) {
+ case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break;
+ case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break;
+ case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break;
+ default: std::cout << "error" << std::endl; break;
+ }
+
+ std::cout << " hugepages = ";
+ if (!hugepages) std::cout << "disabled" << std::endl;
+ else if (hugepages_success) std::cout << "enabled" << std::endl;
+ else std::cout << "failed" << std::endl;
+
+ std::cout << " verbosity = " << verbose << std::endl;
+ std::cout << " cache_size = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl;
+ std::cout << " max_spatial_split_replications = " << max_spatial_split_replications << std::endl;
+
+ std::cout << "triangles:" << std::endl;
+ std::cout << " accel = " << tri_accel << std::endl;
+ std::cout << " builder = " << tri_builder << std::endl;
+ std::cout << " traverser = " << tri_traverser << std::endl;
+
+ std::cout << "motion blur triangles:" << std::endl;
+ std::cout << " accel = " << tri_accel_mb << std::endl;
+ std::cout << " builder = " << tri_builder_mb << std::endl;
+ std::cout << " traverser = " << tri_traverser_mb << std::endl;
+
+ std::cout << "quads:" << std::endl;
+ std::cout << " accel = " << quad_accel << std::endl;
+ std::cout << " builder = " << quad_builder << std::endl;
+ std::cout << " traverser = " << quad_traverser << std::endl;
+
+ std::cout << "motion blur quads:" << std::endl;
+ std::cout << " accel = " << quad_accel_mb << std::endl;
+ std::cout << " builder = " << quad_builder_mb << std::endl;
+ std::cout << " traverser = " << quad_traverser_mb << std::endl;
+
+ std::cout << "line segments:" << std::endl;
+ std::cout << " accel = " << line_accel << std::endl;
+ std::cout << " builder = " << line_builder << std::endl;
+ std::cout << " traverser = " << line_traverser << std::endl;
+
+ std::cout << "motion blur line segments:" << std::endl;
+ std::cout << " accel = " << line_accel_mb << std::endl;
+ std::cout << " builder = " << line_builder_mb << std::endl;
+ std::cout << " traverser = " << line_traverser_mb << std::endl;
+
+ std::cout << "hair:" << std::endl;
+ std::cout << " accel = " << hair_accel << std::endl;
+ std::cout << " builder = " << hair_builder << std::endl;
+ std::cout << " traverser = " << hair_traverser << std::endl;
+
+ std::cout << "motion blur hair:" << std::endl;
+ std::cout << " accel = " << hair_accel_mb << std::endl;
+ std::cout << " builder = " << hair_builder_mb << std::endl;
+ std::cout << " traverser = " << hair_traverser_mb << std::endl;
+
+ std::cout << "subdivision surfaces:" << std::endl;
+ std::cout << " accel = " << subdiv_accel << std::endl;
+
+ std::cout << "grids:" << std::endl;
+ std::cout << " accel = " << grid_accel << std::endl;
+ std::cout << " builder = " << grid_builder << std::endl;
+
+ std::cout << "motion blur grids:" << std::endl;
+ std::cout << " accel = " << grid_accel_mb << std::endl;
+ std::cout << " builder = " << grid_builder_mb << std::endl;
+
+ std::cout << "object_accel:" << std::endl;
+ std::cout << " min_leaf_size = " << object_accel_min_leaf_size << std::endl;
+ std::cout << " max_leaf_size = " << object_accel_max_leaf_size << std::endl;
+
+ std::cout << "object_accel_mb:" << std::endl;
+ std::cout << " min_leaf_size = " << object_accel_mb_min_leaf_size << std::endl;
+ std::cout << " max_leaf_size = " << object_accel_mb_max_leaf_size << std::endl;
+ }
+}
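Note on the hunk above: the keys parsed here are among the ones accepted in the device configuration string. A minimal sketch of how an application reaches this parser through the public API, assuming a standard Embree 3 setup in which the device constructor routes its config string into State::parseString():

    #include <embree3/rtcore.h>

    int main()
    {
      // each "key=value" pair is tokenized by State::parse(); note that
      // tessellation_cache_size is given in MB (see the *1024*1024 above)
      RTCDevice device = rtcNewDevice(
        "verbose=2,quality=high,scene_flags=robust|compact,tessellation_cache_size=64");
      rtcReleaseDevice(device);
      return 0;
    }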
diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h
new file mode 100644
index 0000000000..d0fccc023f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.h
@@ -0,0 +1,197 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+ /* mutex to make printing to cout thread safe */
+ extern MutexSys g_printMutex;
+
+ struct State : public RefCount
+ {
+ public:
+ /*! state construction */
+ State ();
+
+ /*! state destruction */
+ ~State();
+
+ /*! verifies that state is correct */
+ void verify();
+
+ /*! parses state from a configuration file */
+ bool parseFile(const FileName& fileName);
+
+ /*! parses the state from a string */
+ void parseString(const char* cfg);
+
+ /*! parses the state from a stream */
+ void parse(Ref<TokenStream> cin);
+
+ /*! prints the state */
+ void print();
+
+ /*! checks if verbosity level is at least N */
+ bool verbosity(size_t N);
+
+ /*! checks if some particular ISA is enabled */
+ bool hasISA(const int isa);
+
+ /*! check whether selected ISA is supported by the HW */
+ bool checkISASupport();
+
+ public:
+ std::string tri_accel; //!< acceleration structure to use for triangles
+ std::string tri_builder; //!< builder to use for triangles
+ std::string tri_traverser; //!< traverser to use for triangles
+
+ public:
+ std::string tri_accel_mb; //!< acceleration structure to use for motion blur triangles
+ std::string tri_builder_mb; //!< builder to use for motion blur triangles
+    std::string tri_traverser_mb;          //!< traverser to use for motion blur triangles
+
+ public:
+ std::string quad_accel; //!< acceleration structure to use for quads
+ std::string quad_builder; //!< builder to use for quads
+ std::string quad_traverser; //!< traverser to use for quads
+
+ public:
+ std::string quad_accel_mb; //!< acceleration structure to use for motion blur quads
+ std::string quad_builder_mb; //!< builder to use for motion blur quads
+ std::string quad_traverser_mb; //!< traverser to use for motion blur quads
+
+ public:
+ std::string line_accel; //!< acceleration structure to use for line segments
+ std::string line_builder; //!< builder to use for line segments
+ std::string line_traverser; //!< traverser to use for line segments
+
+ public:
+ std::string line_accel_mb; //!< acceleration structure to use for motion blur line segments
+ std::string line_builder_mb; //!< builder to use for motion blur line segments
+ std::string line_traverser_mb; //!< traverser to use for motion blur line segments
+
+ public:
+ std::string hair_accel; //!< hair acceleration structure to use
+ std::string hair_builder; //!< builder to use for hair
+ std::string hair_traverser; //!< traverser to use for hair
+
+ public:
+ std::string hair_accel_mb; //!< acceleration structure to use for motion blur hair
+ std::string hair_builder_mb; //!< builder to use for motion blur hair
+ std::string hair_traverser_mb; //!< traverser to use for motion blur hair
+
+ public:
+ std::string object_accel; //!< acceleration structure for user geometries
+ std::string object_builder; //!< builder for user geometries
+ int object_accel_min_leaf_size; //!< minimum leaf size for object acceleration structure
+ int object_accel_max_leaf_size; //!< maximum leaf size for object acceleration structure
+
+ public:
+    std::string object_accel_mb;                //!< acceleration structure for motion blur user geometries
+    std::string object_builder_mb;              //!< builder for motion blur user geometries
+ int object_accel_mb_min_leaf_size; //!< minimum leaf size for mblur object acceleration structure
+ int object_accel_mb_max_leaf_size; //!< maximum leaf size for mblur object acceleration structure
+
+ public:
+ std::string subdiv_accel; //!< acceleration structure to use for subdivision surfaces
+    std::string subdiv_accel_mb;           //!< acceleration structure to use for motion blur subdivision surfaces
+
+ public:
+ std::string grid_accel; //!< acceleration structure to use for grids
+ std::string grid_builder; //!< builder for grids
+ std::string grid_accel_mb; //!< acceleration structure to use for motion blur grids
+ std::string grid_builder_mb; //!< builder for motion blur grids
+
+ public:
+    float max_spatial_split_replications;  //!< spatial splits may replicate primitives up to replications*N entries in the accel
+ bool useSpatialPreSplits; //!< use spatial pre-splits instead of the full spatial split builder
+ size_t tessellation_cache_size; //!< size of the shared tessellation cache
+
+ public:
+    size_t instancing_open_min;            //!< instancing opens the tree to at least this number of subtrees
+    size_t instancing_block_size;          //!< instancing opens the tree up to an average block size of that many primitives
+    float instancing_open_factor;          //!< instancing opens the tree up to this factor times the number of instances
+    size_t instancing_open_max_depth;      //!< maximum opening depth for instanced geometries
+    size_t instancing_open_max;            //!< instancing opens the tree to at most this number of subtrees
+
+ public:
+    bool ignore_config_files;          //!< if true, no further config files get parsed
+ bool float_exceptions; //!< enable floating point exceptions
+ int quality_flags;
+ int scene_flags;
+ size_t verbose; //!< verbosity of output
+    size_t benchmark;                  //!< benchmark mode, enabled via the "benchmark" config option
+
+ public:
+ size_t numThreads; //!< number of threads to use in builders
+ size_t numUserThreads; //!< number of user provided threads to use in builders
+ bool set_affinity; //!< sets affinity for worker threads
+ bool start_threads; //!< true when threads should be started at device creation time
+ int enabled_cpu_features; //!< CPU ISA features to use
+ int enabled_builder_cpu_features; //!< CPU ISA features to use for builders only
+ enum FREQUENCY_LEVEL {
+ FREQUENCY_SIMD128,
+ FREQUENCY_SIMD256,
+ FREQUENCY_SIMD512
+ } frequency_level; //!< frequency level the app wants to run on (default is SIMD256)
+ bool enable_selockmemoryprivilege; //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages
+ bool hugepages; //!< true if huge pages should get used
+ bool hugepages_success; //!< status for enabling huge pages
+
+ public:
+    size_t alloc_main_block_size;     //!< main allocation block size (shared between threads)
+    int    alloc_num_main_slots;      //!< number of such shared blocks to allocate from
+    size_t alloc_thread_block_size;   //!< block size of the thread-local allocator
+    int    alloc_single_thread_alloc; //!< when enabled, nodes and leaves share the same thread-local allocator
+
+ public:
+
+ /*! checks if we can use AVX */
+ bool canUseAVX() {
+ return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128;
+ }
+
+ /*! checks if we can use AVX2 */
+ bool canUseAVX2() {
+ return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128;
+ }
+
+ struct ErrorHandler
+ {
+ public:
+ ErrorHandler();
+ ~ErrorHandler();
+ RTCError* error();
+
+ public:
+ tls_t thread_error;
+ std::vector<RTCError*> thread_errors;
+ MutexSys errors_mutex;
+ };
+ ErrorHandler errorHandler;
+ static ErrorHandler g_errorHandler;
+
+ public:
+ void setErrorFunction(RTCErrorFunction fptr, void* uptr)
+ {
+ error_function = fptr;
+ error_function_userptr = uptr;
+ }
+
+ RTCErrorFunction error_function;
+ void* error_function_userptr;
+
+ public:
+ void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr)
+ {
+ memory_monitor_function = fptr;
+ memory_monitor_userptr = uptr;
+ }
+
+ RTCMemoryMonitorFunction memory_monitor_function;
+ void* memory_monitor_userptr;
+ };
+}
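Note on state.h: a brief, illustrative sketch of how this state is typically consulted inside the kernels. buildExample is hypothetical, but verbosity() and canUseAVX2() are the members declared above:

    void buildExample(embree::State* state)
    {
      // gate diagnostic output on the configured verbosity level
      if (state->verbosity(2))
        std::cout << "starting build" << std::endl;

      // take the wider code path only when AVX2 is enabled and the
      // frequency level permits it (canUseAVX2() checks both)
      if (state->canUseAVX2()) { /* dispatch AVX2 kernel */ }
      else                     { /* dispatch fallback kernel */ }
    }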
diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h
new file mode 100644
index 0000000000..b478762240
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/vector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "default.h"
+
+namespace embree
+{
+  /*! interface through which the memory monitor callback is invoked */
+ struct MemoryMonitorInterface {
+ virtual void memoryMonitor(ssize_t bytes, bool post) = 0;
+ };
+
+ /*! allocator that performs aligned monitored allocations */
+ template<typename T, size_t alignment = 64>
+ struct aligned_monitored_allocator
+ {
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device)
+ : device(device), hugepages(false) {}
+
+ __forceinline pointer allocate( size_type n )
+ {
+ if (n) {
+ assert(device);
+ device->memoryMonitor(n*sizeof(T),false);
+ }
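+      // allocations of at least 14 * PAGE_SIZE_2M (28 MiB) bypass
+      // alignedMalloc and go through os_malloc, which may back them with huge pages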
+ if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M)
+ {
+ pointer p = (pointer) os_malloc(n*sizeof(value_type),hugepages);
+ assert(p);
+ return p;
+ }
+ return (pointer) alignedMalloc(n*sizeof(value_type),alignment);
+ }
+
+ __forceinline void deallocate( pointer p, size_type n )
+ {
+ if (p)
+ {
+ if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M)
+ os_free(p,n*sizeof(value_type),hugepages);
+ else
+ alignedFree(p);
+ }
+ else assert(n == 0);
+
+ if (n) {
+ assert(device);
+ device->memoryMonitor(-ssize_t(n)*sizeof(T),true);
+ }
+ }
+
+ __forceinline void construct( pointer p, const_reference val ) {
+ new (p) T(val);
+ }
+
+ __forceinline void destroy( pointer p ) {
+ p->~T();
+ }
+
+ private:
+ MemoryMonitorInterface* device;
+ bool hugepages;
+ };
+
+ /*! monitored vector */
+ template<typename T>
+ using mvector = vector_t<T,aligned_monitored_allocator<T,std::alignment_of<T>::value> >;
+}
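Note on vector.h: a small usage sketch, with hedged assumptions: 'device' stands for any MemoryMonitorInterface implementation (in the kernels this is the embree Device itself), and PrimRef is the primitive reference type from primref.h:

    // every grow/shrink of the vector is reported through device->memoryMonitor();
    // blocks of >= 14 * PAGE_SIZE_2M may be backed by huge pages
    mvector<PrimRef> prims(device, numPrimitives);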