diff options
author | RĂ©mi Verschelde <remi@verschelde.fr> | 2021-04-27 19:07:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-27 19:07:12 +0200 |
commit | 95cfce661bbe9700209c09dfe297ab7ef5ebfe09 (patch) | |
tree | 9f3c8a98b619e2a0dd171d6cb6d8dc2e791baa14 /thirdparty/embree-aarch64/kernels/common | |
parent | b999fbc4bd349cce153c2133dd0487694add1a05 (diff) | |
parent | 4d9d99bb827967e2bb931eeb8c3f0e079b39ae1a (diff) |
Merge pull request #48050 from JFonS/occlusion_culling
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/common')
46 files changed, 15110 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h new file mode 100644 index 0000000000..c038d3cf21 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accel.h @@ -0,0 +1,556 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "point_query.h" +#include "context.h" + +namespace embree +{ + class Scene; + + /*! Base class for the acceleration structure data. */ + class AccelData : public RefCount + { + ALIGNED_CLASS_(16); + public: + enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 }; + + public: + AccelData (const Type type) + : bounds(empty), type(type) {} + + /*! notifies the acceleration structure about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears the acceleration structure data */ + virtual void clear() = 0; + + /*! returns normal bounds */ + __forceinline BBox3fa getBounds() const { + return bounds.bounds(); + } + + /*! returns bounds for some time */ + __forceinline BBox3fa getBounds(float t) const { + return bounds.interpolate(t); + } + + /*! returns linear bounds */ + __forceinline LBBox3fa getLinearBounds() const { + return bounds; + } + + /*! checks if acceleration structure is empty */ + __forceinline bool isEmpty() const { + return bounds.bounds0.lower.x == float(pos_inf); + } + + public: + LBBox3fa bounds; // linear bounds + Type type; + }; + + /*! Base class for all intersectable and buildable acceleration structures. */ + class Accel : public AccelData + { + ALIGNED_CLASS_(16); + public: + + struct Intersectors; + + /*! Type of collide function */ + typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr); + + /*! Type of point query function */ + typedef bool(*PointQueryFunc)(Intersectors* This, /*!< this pointer to accel */ + PointQuery* query, /*!< point query for lookup */ + PointQueryContext* context); /*!< point query context */ + + /*! Type of intersect function pointer for single rays. */ + typedef void (*IntersectFunc)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHit& ray, /*!< ray to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 4. */ + typedef void (*IntersectFunc4)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit4& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 8. */ + typedef void (*IntersectFunc8)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit8& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 16. */ + typedef void (*IntersectFunc16)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit16& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHitN** ray, /*!< ray stream to intersect */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + + + /*! Type of occlusion function pointer for single rays. */ + typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */ + RTCRay& ray, /*!< ray to test occlusion */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 4. */ + typedef void (*OccludedFunc4) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay4& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 8. */ + typedef void (*OccludedFunc8) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay8& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 16. */ + typedef void (*OccludedFunc16) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay16& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayN** ray, /*!< ray stream to test occlusion */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + typedef void (*ErrorFunc) (); + + struct Collider + { + Collider (ErrorFunc error = nullptr) + : collide((CollideFunc)error), name(nullptr) {} + + Collider (CollideFunc collide, const char* name) + : collide(collide), name(name) {} + + operator bool() const { return name; } + + public: + CollideFunc collide; + const char* name; + }; + + struct Intersector1 + { + Intersector1 (ErrorFunc error = nullptr) + : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc intersect; + OccludedFunc occluded; + PointQueryFunc pointQuery; + const char* name; + }; + + struct Intersector4 + { + Intersector4 (ErrorFunc error = nullptr) + : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {} + + Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc4 intersect; + OccludedFunc4 occluded; + const char* name; + }; + + struct Intersector8 + { + Intersector8 (ErrorFunc error = nullptr) + : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {} + + Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc8 intersect; + OccludedFunc8 occluded; + const char* name; + }; + + struct Intersector16 + { + Intersector16 (ErrorFunc error = nullptr) + : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {} + + Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc16 intersect; + OccludedFunc16 occluded; + const char* name; + }; + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + struct Intersectors + { + Intersectors() + : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {} + + Intersectors (ErrorFunc error) + : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {} + + void print(size_t ident) + { + if (collider.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "collider = " << collider.name << std::endl; + } + if (intersector1.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector1 = " << intersector1.name << std::endl; + } + if (intersector4.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector4 = " << intersector4.name << std::endl; + } + if (intersector8.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector8 = " << intersector8.name << std::endl; + } + if (intersector16.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector16 = " << intersector16.name << std::endl; + } + if (intersectorN.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersectorN = " << intersectorN.name << std::endl; + } + } + + void select(bool filter) + { + if (intersector4_filter) { + if (filter) intersector4 = intersector4_filter; + else intersector4 = intersector4_nofilter; + } + if (intersector8_filter) { + if (filter) intersector8 = intersector8_filter; + else intersector8 = intersector8_nofilter; + } + if (intersector16_filter) { + if (filter) intersector16 = intersector16_filter; + else intersector16 = intersector16_nofilter; + } + if (intersectorN_filter) { + if (filter) intersectorN = intersectorN_filter; + else intersectorN = intersectorN_nofilter; + } + } + + __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) { + assert(intersector1.pointQuery); + return intersector1.pointQuery(this,query,context); + } + + /*! collides two scenes */ + __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) { + assert(collider.collide); + collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr); + } + + /*! Intersects a single ray with the scene. */ + __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) { + assert(intersector1.intersect); + intersector1.intersect(this,ray,context); + } + + /*! Intersects a packet of 4 rays with the scene. */ + __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) { + assert(intersector4.intersect); + intersector4.intersect(valid,this,ray,context); + } + + /*! Intersects a packet of 8 rays with the scene. */ + __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) { + assert(intersector8.intersect); + intersector8.intersect(valid,this,ray,context); + } + + /*! Intersects a packet of 16 rays with the scene. */ + __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) { + assert(intersector16.intersect); + intersector16.intersect(valid,this,ray,context); + } + + /*! Intersects a stream of N rays in SOA layout with the scene. */ + __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context) + { + assert(intersectorN.intersect); + intersectorN.intersect(this,rayN,N,context); + } + +#if defined(__SSE__) || defined(__ARM_NEON) + __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) { + const vint<4> mask = valid.mask32(); + intersect4(&mask,(RTCRayHit4&)ray,context); + } +#endif +#if defined(__AVX__) + __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) { + const vint<8> mask = valid.mask32(); + intersect8(&mask,(RTCRayHit8&)ray,context); + } +#endif +#if defined(__AVX512F__) + __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) { + const vint<16> mask = valid.mask32(); + intersect16(&mask,(RTCRayHit16&)ray,context); + } +#endif + + template<int K> + __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context) + { + intersectN((RTCRayHitN**)rayN,N,context); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (RTCRay& ray, IntersectContext* context) { + assert(intersector1.occluded); + intersector1.occluded(this,ray,context); + } + + /*! Tests if a packet of 4 rays is occluded by the scene. */ + __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) { + assert(intersector4.occluded); + intersector4.occluded(valid,this,ray,context); + } + + /*! Tests if a packet of 8 rays is occluded by the scene. */ + __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) { + assert(intersector8.occluded); + intersector8.occluded(valid,this,ray,context); + } + + /*! Tests if a packet of 16 rays is occluded by the scene. */ + __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) { + assert(intersector16.occluded); + intersector16.occluded(valid,this,ray,context); + } + + /*! Tests if a stream of N rays in SOA layout is occluded by the scene. */ + __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context) + { + assert(intersectorN.occluded); + intersectorN.occluded(this,rayN,N,context); + } + +#if defined(__SSE__) || defined(__ARM_NEON) + __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) { + const vint<4> mask = valid.mask32(); + occluded4(&mask,(RTCRay4&)ray,context); + } +#endif +#if defined(__AVX__) + __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) { + const vint<8> mask = valid.mask32(); + occluded8(&mask,(RTCRay8&)ray,context); + } +#endif +#if defined(__AVX512F__) + __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) { + const vint<16> mask = valid.mask32(); + occluded16(&mask,(RTCRay16&)ray,context); + } +#endif + + template<int K> + __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context) + { + occludedN((RTCRayN**)rayN,N,context); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void intersect(RTCRay& ray, IntersectContext* context) { + occluded(ray, context); + } + + /*! Tests if a packet of K rays is occluded by the scene. */ + template<int K> + __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) { + occluded(valid, ray, context); + } + + /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */ + template<int K> + __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) { + occludedN(rayN, N, context); + } + + public: + AccelData* ptr; + void* leafIntersector; + Collider collider; + Intersector1 intersector1; + Intersector4 intersector4; + Intersector4 intersector4_filter; + Intersector4 intersector4_nofilter; + Intersector8 intersector8; + Intersector8 intersector8_filter; + Intersector8 intersector8_nofilter; + Intersector16 intersector16; + Intersector16 intersector16_filter; + Intersector16 intersector16_nofilter; + IntersectorN intersectorN; + IntersectorN intersectorN_filter; + IntersectorN intersectorN_nofilter; + }; + + public: + + /*! Construction */ + Accel (const AccelData::Type type) + : AccelData(type) {} + + /*! Construction */ + Accel (const AccelData::Type type, const Intersectors& intersectors) + : AccelData(type), intersectors(intersectors) {} + + /*! Virtual destructor */ + virtual ~Accel() {} + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! build acceleration structure */ + virtual void build () = 0; + + public: + Intersectors intersectors; + }; + +#define DEFINE_COLLIDER(symbol,collider) \ + Accel::Collider symbol() { \ + return Accel::Collider((Accel::CollideFunc)collider::collide, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR1(symbol,intersector) \ + Accel::Intersector1 symbol() { \ + return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \ + (Accel::OccludedFunc )intersector::occluded, \ + (Accel::PointQueryFunc)intersector::pointQuery,\ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR4(symbol,intersector) \ + Accel::Intersector4 symbol() { \ + return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \ + (Accel::OccludedFunc4)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR8(symbol,intersector) \ + Accel::Intersector8 symbol() { \ + return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \ + (Accel::OccludedFunc8)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR16(symbol,intersector) \ + Accel::Intersector16 symbol() { \ + return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \ + (Accel::OccludedFunc16)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTORN(symbol,intersector) \ + Accel::IntersectorN symbol() { \ + return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \ + (Accel::OccludedFuncN)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + + /* ray stream filter interface */ + typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit* _rayN, const size_t N, const size_t stride, IntersectContext* context); + typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context); + typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context); + typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context); + + typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay* _rayN, const size_t N, const size_t stride, IntersectContext* context); + typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context); + typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context); + typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context); + + struct RayStreamFilterFuncs + { + RayStreamFilterFuncs() + : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr), + occludedAOS(nullptr), occludedAOP(nullptr), occludedSOA(nullptr), occludedSOP(nullptr) {} + + RayStreamFilterFuncs(void (*ptr) ()) + : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr), + occludedAOS((occludedStreamAOS_func) ptr), occludedAOP((occludedStreamAOP_func) ptr), occludedSOA((occludedStreamSOA_func) ptr), occludedSOP((occludedStreamSOP_func) ptr) {} + + RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP, + occludedStreamAOS_func occludedAOS, occludedStreamAOP_func occludedAOP, occludedStreamSOA_func occludedSOA, occludedStreamSOP_func occludedSOP) + : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP), + occludedAOS(occludedAOS), occludedAOP(occludedAOP), occludedSOA(occludedSOA), occludedSOP(occludedSOP) {} + + public: + intersectStreamAOS_func intersectAOS; + intersectStreamAOP_func intersectAOP; + intersectStreamSOA_func intersectSOA; + intersectStreamSOP_func intersectSOP; + + occludedStreamAOS_func occludedAOS; + occludedStreamAOP_func occludedAOP; + occludedStreamSOA_func occludedSOA; + occludedStreamSOP_func occludedSOP; + }; + + typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)(); +} diff --git a/thirdparty/embree-aarch64/kernels/common/accelinstance.h b/thirdparty/embree-aarch64/kernels/common/accelinstance.h new file mode 100644 index 0000000000..d74b96df3f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accelinstance.h @@ -0,0 +1,41 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" +#include "builder.h" + +namespace embree +{ + class AccelInstance : public Accel + { + public: + AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors) + : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {} + + void immutable () { + builder.reset(nullptr); + } + + public: + void build () { + if (builder) builder->build(); + bounds = accel->bounds; + } + + void deleteGeometry(size_t geomID) { + if (accel ) accel->deleteGeometry(geomID); + if (builder) builder->deleteGeometry(geomID); + } + + void clear() { + if (accel) accel->clear(); + if (builder) builder->clear(); + } + + private: + std::unique_ptr<AccelData> accel; + std::unique_ptr<Builder> builder; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp new file mode 100644 index 0000000000..aadb4a64ef --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/acceln.cpp @@ -0,0 +1,232 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "acceln.h" +#include "ray.h" +#include "../../include/embree3/rtcore_ray.h" +#include "../../common/algorithms/parallel_for.h" + +namespace embree +{ + AccelN::AccelN() + : Accel(AccelData::TY_ACCELN), accels() {} + + AccelN::~AccelN() + { + for (size_t i=0; i<accels.size(); i++) + delete accels[i]; + } + + void AccelN::accels_add(Accel* accel) + { + assert(accel); + accels.push_back(accel); + } + + void AccelN::accels_init() + { + for (size_t i=0; i<accels.size(); i++) + delete accels[i]; + + accels.clear(); + } + + bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context) + { + bool changed = false; + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + changed |= This->accels[i]->intersectors.pointQuery(query,context); + return changed; + } + + void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect(ray,context); + } + + void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect4(valid,ray,context); + } + + void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect8(valid,ray,context); + } + + void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect16(valid,ray,context); + } + + void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersectN(ray,N,context); + } + + void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded(ray,context); + if (ray.tfar < 0.0f) break; + } + } + + void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded4(valid,ray,context); +#if defined(__SSE2__) || defined(__ARM_NEON) + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + if (unlikely(none(valid0 & hit0))) break; +#endif + } + } + + void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded8(valid,ray,context); +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; +#endif + } + } + + void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded16(valid,ray,context); +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + vbool4 valid2 = asBool(((vint4*)valid)[2]); + vbool4 hit2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero); + vbool4 valid3 = asBool(((vint4*)valid)[3]); + vbool4 hit3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break; +#endif + } + } + + void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + size_t M = N; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.occludedN(ray,M,context); + } + + void AccelN::accels_print(size_t ident) + { + for (size_t i=0; i<accels.size(); i++) + { + for (size_t j=0; j<ident; j++) std::cout << " "; + std::cout << "accels[" << i << "]" << std::endl; + accels[i]->intersectors.print(ident+2); + } + } + + void AccelN::accels_immutable() + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->immutable(); + } + + void AccelN::accels_build () + { + /* reduce memory consumption */ + accels.shrink_to_fit(); + + /* build all acceleration structures in parallel */ + parallel_for (accels.size(), [&] (size_t i) { + accels[i]->build(); + }); + + /* create list of non-empty acceleration structures */ + bool valid1 = true; + bool valid4 = true; + bool valid8 = true; + bool valid16 = true; + for (size_t i=0; i<accels.size(); i++) { + valid1 &= (bool) accels[i]->intersectors.intersector1; + valid4 &= (bool) accels[i]->intersectors.intersector4; + valid8 &= (bool) accels[i]->intersectors.intersector8; + valid16 &= (bool) accels[i]->intersectors.intersector16; + } + + if (accels.size() == 1) { + type = accels[0]->type; // FIXME: should just assign entire Accel + bounds = accels[0]->bounds; + intersectors = accels[0]->intersectors; + } + else + { + type = AccelData::TY_ACCELN; + intersectors.ptr = this; + intersectors.intersector1 = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr); + intersectors.intersector4 = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr); + intersectors.intersector8 = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr); + intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr); + intersectors.intersectorN = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN"); + + /*! calculate bounds */ + bounds = empty; + for (size_t i=0; i<accels.size(); i++) + bounds.extend(accels[i]->bounds); + } + } + + void AccelN::accels_select(bool filter) + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->intersectors.select(filter); + } + + void AccelN::accels_deleteGeometry(size_t geomID) + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->deleteGeometry(geomID); + } + + void AccelN::accels_clear() + { + for (size_t i=0; i<accels.size(); i++) { + accels[i]->clear(); + } + } +} + diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h new file mode 100644 index 0000000000..2edd98f647 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/acceln.h @@ -0,0 +1,49 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" + +namespace embree +{ + /*! merges N acceleration structures together, by processing them in order */ + class AccelN : public Accel + { + public: + AccelN (); + ~AccelN(); + + public: + void accels_add(Accel* accel); + void accels_init(); + + public: + static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + + public: + static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context); + static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context); + static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context); + static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context); + static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context); + + public: + static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context); + static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context); + static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context); + static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context); + static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context); + + public: + void accels_print(size_t ident); + void accels_immutable(); + void accels_build (); + void accels_select(bool filter); + void accels_deleteGeometry(size_t geomID); + void accels_clear (); + + public: + std::vector<Accel*> accels; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp new file mode 100644 index 0000000000..79be1c4301 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accelset.cpp @@ -0,0 +1,17 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "accelset.h" +#include "scene.h" + +namespace embree +{ + AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) + : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (ErrorFunc error) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} +} diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h new file mode 100644 index 0000000000..3774b2accb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accelset.h @@ -0,0 +1,248 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "builder.h" +#include "geometry.h" +#include "ray.h" +#include "hit.h" + +namespace embree +{ + struct IntersectFunctionNArguments; + struct OccludedFunctionNArguments; + + typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + + struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportIntersectionFunc report; + }; + + struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportOcclusionFunc report; + }; + + /*! Base class for set of acceleration structures. */ + class AccelSet : public Geometry + { + public: + typedef RTCIntersectFunctionN IntersectFuncN; + typedef RTCOccludedFunctionN OccludedFuncN; + typedef void (*ErrorFunc) (); + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) ; + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name); + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + public: + + /*! construction */ + AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps); + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! build accel */ + virtual void build () = 0; + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid_non_empty(bounds(i,itime))) return false; + + return true; + } + + /*! Calculates the bounds of an item */ + __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const + { + BBox3fa box; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)itime; + args.bounds_o = (RTCBounds*)&box; + boundsFunc(&args); + return box; + } + + /*! calculates the linear bounds of the i'th item at the itime'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const + { + BBox3fa box[2]; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)(itime+0); + args.bounds_o = (RTCBounds*)&box[0]; + boundsFunc(&args); + args.timeStep = (unsigned int)(itime+1); + args.bounds_o = (RTCBounds*)&box[1]; + boundsFunc(&args); + return LBBox3fa(box[0],box[1]); + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid_non_empty(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds0; // use bounding box of first timestep to build BVH + return isvalid_non_empty(bounds); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + public: + + /*! Intersects a single ray with the scene. */ + __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + int mask = -1; + IntersectFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + int mask = -1; + OccludedFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + /*! Intersects a packet of K rays with the scene. */ + template<int K> + __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + vint<K> mask = valid.mask32(); + IntersectFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if a packet of K rays is occluded by the scene. */ + template<int K> + __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + vint<K> mask = valid.mask32(); + OccludedFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + public: + RTCBoundsFunction boundsFunc; + IntersectorN intersectorN; + }; + +#define DEFINE_SET_INTERSECTORN(symbol,intersector) \ + AccelSet::IntersectorN symbol() { \ + return AccelSet::IntersectorN(intersector::intersect, \ + intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp new file mode 100644 index 0000000000..6fa406f03a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/alloc.cpp @@ -0,0 +1,82 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "../../common/sys/thread.h" +#if defined(__aarch64__) && defined(BUILD_IOS) +#include "../../common/sys/barrier.h" +#endif + +namespace embree +{ + __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; + SpinLock FastAllocator::s_thread_local_allocators_lock; + std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators; + + struct fast_allocator_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic<size_t> numFailed; + std::unique_ptr<FastAllocator> alloc; + + fast_allocator_regression_test() + : RegressionTest("fast_allocator_regression_test"), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(fast_allocator_regression_test* This) + { + FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator(); + + size_t* ptrs[1000]; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + for (size_t i=0; i<1000; i++) { + ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32)); + *ptrs[i] = size_t(threadalloc.talloc0) + i; + } + for (size_t i=0; i<1000; i++) { + if (*ptrs[i] != size_t(threadalloc.talloc0) + i) + This->numFailed++; + } + This->barrier.wait(); + } + } + + bool run () + { + alloc = make_unique(new FastAllocator(nullptr,false)); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + alloc->reset(); + barrier.wait(); + barrier.wait(); + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + alloc = nullptr; + + return numFailed == 0; + } + }; + + fast_allocator_regression_test fast_allocator_regression; +} + + diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.h b/thirdparty/embree-aarch64/kernels/common/alloc.h new file mode 100644 index 0000000000..488fa707ef --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/alloc.h @@ -0,0 +1,1006 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "primref.h" + +#if defined(__aarch64__) && defined(BUILD_IOS) +#include <mutex> +#endif + +namespace embree +{ + class FastAllocator + { + /*! maximum supported alignment */ + static const size_t maxAlignment = 64; + + /*! maximum allocation size */ + + /* default settings */ + //static const size_t defaultBlockSize = 4096; +#define maxAllocationSize size_t(2*1024*1024-maxAlignment) + + static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; + + public: + + struct ThreadLocal2; + enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; + + /*! Per thread structure holding the current memory block. */ + struct __aligned(64) ThreadLocal + { + ALIGNED_CLASS_(64); + public: + + /*! Constructor for usage with ThreadLocalData */ + __forceinline ThreadLocal (ThreadLocal2* parent) + : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} + + /*! initialize allocator */ + void init(FastAllocator* alloc) + { + ptr = nullptr; + cur = end = 0; + bytesUsed = 0; + bytesWasted = 0; + allocBlockSize = 0; + if (alloc) allocBlockSize = alloc->defaultBlockSize; + } + + /* Allocate aligned memory from the threads memory block. */ + __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) + { + /* bind the thread local allocator to the proper FastAllocator*/ + parent->bind(alloc); + + assert(align <= maxAlignment); + bytesUsed += bytes; + + /* try to allocate in local block */ + size_t ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* if allocation is too large allocate with parent allocator */ + if (4*bytes > allocBlockSize) { + return alloc->malloc(bytes,maxAlignment,false); + } + + /* get new partial block if allocation failed */ + size_t blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* get new full block if allocation failed */ + blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* should never happen as large allocations get handled specially above */ + assert(false); + return nullptr; + } + + + /*! returns amount of used bytes */ + __forceinline size_t getUsedBytes() const { return bytesUsed; } + + /*! returns amount of free bytes */ + __forceinline size_t getFreeBytes() const { return end-cur; } + + /*! returns amount of wasted bytes */ + __forceinline size_t getWastedBytes() const { return bytesWasted; } + + private: + ThreadLocal2* parent; + char* ptr; //!< pointer to memory block + size_t cur; //!< current location of the allocator + size_t end; //!< end of the memory block + size_t allocBlockSize; //!< block size for allocations + size_t bytesUsed; //!< number of total bytes allocated + size_t bytesWasted; //!< number of bytes wasted + }; + + /*! Two thread local structures. */ + struct __aligned(64) ThreadLocal2 + { + ALIGNED_CLASS_(64); + public: + + __forceinline ThreadLocal2() + : alloc(nullptr), alloc0(this), alloc1(this) {} + + /*! bind to fast allocator */ + __forceinline void bind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() == alloc_i) return; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock<SpinLock> lock(mutex); +#endif + //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind + if (alloc.load()) { + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + } + alloc0.init(alloc_i); + alloc1.init(alloc_i); + alloc.store(alloc_i); + alloc_i->join(this); + } + + /*! unbind to fast allocator */ + void unbind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() != alloc_i) return; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock<SpinLock> lock(mutex); +#endif + if (alloc.load() != alloc_i) return; // required as a different thread calls unbind + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + alloc0.init(nullptr); + alloc1.init(nullptr); + alloc.store(nullptr); + } + + public: +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex mutex; +#else + SpinLock mutex; //!< required as unbind is called from other threads +#endif + std::atomic<FastAllocator*> alloc; //!< parent allocator + ThreadLocal alloc0; + ThreadLocal alloc1; + }; + + FastAllocator (Device* device, bool osAllocation) + : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), + primrefarray(device,0) + { + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + assert(!slotMutex[i].isLocked()); + } + } + + ~FastAllocator () { + clear(); + } + + /*! returns the device attached to this allocator */ + Device* getDevice() { + return device; + } + + void share(mvector<PrimRef>& primrefarray_i) { + primrefarray = std::move(primrefarray_i); + } + + void unshare(mvector<PrimRef>& primrefarray_o) + { + reset(); // this removes blocks that are allocated inside the shared primref array + primrefarray_o = std::move(primrefarray); + } + + /*! returns first fast thread local allocator */ + __forceinline ThreadLocal* _threadLocal() { + return &threadLocal2()->alloc0; + } + + void setOSallocation(bool flag) + { + atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; + } + + private: + + /*! returns both fast thread local allocators */ + __forceinline ThreadLocal2* threadLocal2() + { + ThreadLocal2* alloc = thread_local_allocator2; + if (alloc == nullptr) { + thread_local_allocator2 = alloc = new ThreadLocal2; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else + Lock<SpinLock> lock(s_thread_local_allocators_lock); +#endif + s_thread_local_allocators.push_back(make_unique(alloc)); + } + return alloc; + } + + public: + + __forceinline void join(ThreadLocal2* alloc) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else + Lock<SpinLock> lock(thread_local_allocators_lock); +#endif + thread_local_allocators.push_back(alloc); + } + + public: + + struct CachedAllocator + { + __forceinline CachedAllocator(void* ptr) + : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) + { + assert(ptr == nullptr); + } + + __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) + : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {} + + __forceinline operator bool () const { + return alloc != nullptr; + } + + __forceinline void* operator() (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { + return talloc1->malloc(alloc,bytes,align); + } + + public: + FastAllocator* alloc; + ThreadLocal* talloc0; + ThreadLocal* talloc1; + }; + + __forceinline CachedAllocator getCachedAllocator() { + return CachedAllocator(this,threadLocal2()); + } + + /*! Builder interface to create thread local allocator */ + struct Create + { + public: + __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} + __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } + + private: + FastAllocator* allocator; + }; + + void internal_fix_used_blocks() + { + /* move thread local blocks to global block list */ + for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + while (threadBlocks[i].load() != nullptr) { + Block* nextUsedBlock = threadBlocks[i].load()->next; + threadBlocks[i].load()->next = usedBlocks.load(); + usedBlocks = threadBlocks[i].load(); + threadBlocks[i] = nextUsedBlock; + } + threadBlocks[i] = nullptr; + } + } + + static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks +#if defined(__AVX512ER__) // KNL + static const size_t mainAllocOverheadStatic = 15; //! 15 means 7.5% allocation overhead through unfilled main alloc blocks +#else + static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks +#endif + static const size_t mainAllocOverheadDynamic = 8; //! 20 means 12.5% allocation overhead through unfilled main alloc blocks + + /* calculates a single threaded threshold for the builders such + * that for small scenes the overhead of partly allocated blocks + * per thread is low */ + size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) + { + if (numPrimitives == 0 || bytesEstimated == 0) + return defaultThreshold; + + /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ + const size_t single_mode_factor = use_single_mode ? 1 : 2; + const size_t threadCount = TaskScheduler::threadCount(); + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; + + /* if we do not have to limit number of threads use optimal thresdhold */ + if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + return defaultThreshold; + + /* otherwise limit number of threads by calculating proper single thread threshold */ + else { + double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); + return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); + } + } + + __forceinline size_t alignSize(size_t i) { + return (i+127)/128*128; + } + + /*! initializes the grow size */ + __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) + { + /* we do not need single thread local allocator mode */ + use_single_mode = false; + + /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ + size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic; + size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); + growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); + + /* if we reached the maxAllocationSize for growSize, we can + * increase the number of allocation slots by still guaranteeing + * the mainAllocationOverhead */ + slotMask = 0x0; + + if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ + + /* set the thread local alloc block size */ + size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; + + /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ +#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL + const size_t threadCount = TaskScheduler::threadCount(); + const size_t single_mode_factor = use_single_mode ? 1 : 2; + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; + if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); + + /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ + else +#endif + defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); + + if (bytesEstimated == 0) { + maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size + defaultBlockSize = defaultBlockSizeSwitch; + } + log2_grow_size_scale = 0; + + if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; + if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; + if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; + if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; + if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; + if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; + if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; + } + + /*! initializes the allocator */ + void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) + { + internal_fix_used_blocks(); + /* distribute the allocation to multiple thread block slots */ + slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + if (bytesReserve == 0) bytesReserve = bytesAllocate; + freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); + estimatedSize = bytesEstimate; + initGrowSizeAndNumSlots(bytesEstimate,true); + } + + /*! initializes the allocator */ + void init_estimate(size_t bytesEstimate) + { + internal_fix_used_blocks(); + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + /* single allocator mode ? */ + estimatedSize = bytesEstimate; + //initGrowSizeAndNumSlots(bytesEstimate,false); + initGrowSizeAndNumSlots(bytesEstimate,false); + + } + + /*! frees state not required after build */ + __forceinline void cleanup() + { + internal_fix_used_blocks(); + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! resets the allocator, memory blocks get reused */ + void reset () + { + internal_fix_used_blocks(); + + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + + /* reset all used blocks and move them to begin of free block list */ + while (usedBlocks.load() != nullptr) { + usedBlocks.load()->reset_block(); + Block* nextUsedBlock = usedBlocks.load()->next; + usedBlocks.load()->next = freeBlocks.load(); + freeBlocks = usedBlocks.load(); + usedBlocks = nextUsedBlock; + } + + /* remove all shared blocks as they are re-added during build */ + freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); + + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + } + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! frees all allocated memory */ + __forceinline void clear() + { + cleanup(); + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; + if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + } + primrefarray.clear(); + } + + __forceinline size_t incGrowSizeScale() + { + size_t scale = log2_grow_size_scale.fetch_add(1)+1; + return size_t(1) << min(size_t(16),scale); + } + + /*! thread safe allocation of memory */ + void* malloc(size_t& bytes, size_t align, bool partial) + { + assert(align <= maxAlignment); + + while (true) + { + /* allocate using current block */ + size_t threadID = TaskScheduler::threadID(); + size_t slot = threadID & slotMask; + Block* myUsedBlocks = threadUsedBlocks[slot]; + if (myUsedBlocks) { + void* ptr = myUsedBlocks->malloc(device,bytes,align,partial); + if (ptr) return ptr; + } + + /* throw error if allocation is too large */ + if (bytes > maxAllocationSize) + throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large"); + + /* parallel block creation in case of no freeBlocks, avoids single global mutex */ + if (likely(freeBlocks.load() == nullptr)) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(slotMutex[slot]); +#else + Lock<SpinLock> lock(slotMutex[slot]); +#endif + if (myUsedBlocks == threadUsedBlocks[slot]) { + const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); + const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); + assert(allocSize >= bytes); + threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! + // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. + } + continue; + } + + /* if this fails allocate new block */ + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock<SpinLock> lock(mutex); +#endif + if (myUsedBlocks == threadUsedBlocks[slot]) + { + if (freeBlocks.load() != nullptr) { + Block* nextFreeBlock = freeBlocks.load()->next; + freeBlocks.load()->next = usedBlocks; + __memory_barrier(); + usedBlocks = freeBlocks.load(); + threadUsedBlocks[slot] = freeBlocks.load(); + freeBlocks = nextFreeBlock; + } else { + const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); + usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! + } + } + } + } + } + + /*! add new block */ + void addBlock(void* ptr, ssize_t bytes) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock<SpinLock> lock(mutex); +#endif + const size_t sizeof_Header = offsetof(Block,data[0]); + void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); + size_t ofs = (size_t) aptr - (size_t) ptr; + bytes -= ofs; + if (bytes < 4096) return; // ignore empty or very small blocks + freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); + } + + /* special allocation only used from morton builder only a single time for each build */ + void* specialAlloc(size_t bytes) + { + assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); + return freeBlocks.load()->ptr(); + } + + struct Statistics + { + Statistics () + : bytesUsed(0), bytesFree(0), bytesWasted(0) {} + + Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) + : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} + + Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) + : bytesUsed(0), bytesFree(0), bytesWasted(0) + { + Block* usedBlocks = alloc->usedBlocks.load(); + Block* freeBlocks = alloc->freeBlocks.load(); + if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); + if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); + if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); + if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); + if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); + } + + std::string str(size_t numPrimitives) + { + std::stringstream str; + str.setf(std::ios::fixed, std::ios::floatfield); + str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); + return str.str(); + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b) + { + return Statistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted); + } + + size_t bytesAllocatedTotal() const { + return bytesUsed + bytesFree + bytesWasted; + } + + public: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + }; + + Statistics getStatistics(AllocationType atype, bool huge_pages = false) { + return Statistics(this,atype,huge_pages); + } + + size_t getUsedBytes() { + return bytesUsed; + } + + size_t getWastedBytes() { + return bytesWasted; + } + + struct AllStatistics + { + AllStatistics (FastAllocator* alloc) + + : bytesUsed(alloc->bytesUsed), + bytesFree(alloc->bytesFree), + bytesWasted(alloc->bytesWasted), + stat_all(alloc,ANY_TYPE), + stat_malloc(alloc,ALIGNED_MALLOC), + stat_4K(alloc,EMBREE_OS_MALLOC,false), + stat_2M(alloc,EMBREE_OS_MALLOC,true), + stat_shared(alloc,SHARED) {} + + AllStatistics (size_t bytesUsed, + size_t bytesFree, + size_t bytesWasted, + Statistics stat_all, + Statistics stat_malloc, + Statistics stat_4K, + Statistics stat_2M, + Statistics stat_shared) + + : bytesUsed(bytesUsed), + bytesFree(bytesFree), + bytesWasted(bytesWasted), + stat_all(stat_all), + stat_malloc(stat_malloc), + stat_4K(stat_4K), + stat_2M(stat_2M), + stat_shared(stat_shared) {} + + friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) + { + return AllStatistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted, + a.stat_all + b.stat_all, + a.stat_malloc + b.stat_malloc, + a.stat_4K + b.stat_4K, + a.stat_2M + b.stat_2M, + a.stat_shared + b.stat_shared); + } + + void print(size_t numPrimitives) + { + std::stringstream str0; + str0.setf(std::ios::fixed, std::ios::floatfield); + str0 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << " " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); + std::cout << str0.str() << std::endl; + + std::stringstream str1; + str1.setf(std::ios::fixed, std::ios::floatfield); + str1 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); + std::cout << str1.str() << std::endl; + + std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; + std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; + std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; + std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; + std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; + } + + private: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + Statistics stat_all; + Statistics stat_malloc; + Statistics stat_4K; + Statistics stat_2M; + Statistics stat_shared; + }; + + void print_blocks() + { + std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; + + std::cout << " used blocks = "; + if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + + std::cout << " free blocks = "; + if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + } + + private: + + struct Block + { + static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) + { + /* We avoid using os_malloc for small blocks as this could + * cause a risk of fragmenting the virtual address space and + * reach the limit of vm.max_map_count = 65k under Linux. */ + if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) + atype = ALIGNED_MALLOC; + + /* we need to additionally allocate some header */ + const size_t sizeof_Header = offsetof(Block,data[0]); + bytesAllocate = sizeof_Header+bytesAllocate; + bytesReserve = sizeof_Header+bytesReserve; + + /* consume full 4k pages with using os_malloc */ + if (atype == EMBREE_OS_MALLOC) { + bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + } + + /* either use alignedMalloc or os_malloc */ + void *ptr = nullptr; + if (atype == ALIGNED_MALLOC) + { + /* special handling for default block size */ + if (bytesAllocate == (2*PAGE_SIZE_2M)) + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + + /* give hint to transparently convert these pages to 2MB pages */ + const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); + os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block + os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); + os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block + + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + else + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + } + else if (atype == EMBREE_OS_MALLOC) + { + if (device) device->memoryMonitor(bytesAllocate,false); + bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); + return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + } + else + assert(false); + + return NULL; + } + + Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) + : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) + { + assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); + } + + static Block* remove_shared_blocks(Block* head) + { + Block** prev_next = &head; + for (Block* block = head; block; block = block->next) { + if (block->atype == SHARED) *prev_next = block->next; + else prev_next = &block->next; + } + return head; + } + + void clear_list(MemoryMonitorInterface* device) + { + Block* block = this; + while (block) { + Block* next = block->next; + block->clear_block(device); + block = next; + } + } + + void clear_block (MemoryMonitorInterface* device) + { + const size_t sizeof_Header = offsetof(Block,data[0]); + const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); + + if (atype == ALIGNED_MALLOC) { + alignedFree(this); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else if (atype == EMBREE_OS_MALLOC) { + size_t sizeof_This = sizeof_Header+reserveEnd; + os_free(this,sizeof_This,huge_pages); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else /* if (atype == SHARED) */ { + } + } + + void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) + { + size_t bytes = bytes_in; + assert(align <= maxAlignment); + bytes = (bytes+(align-1)) & ~(align-1); + if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; + const size_t i = cur.fetch_add(bytes); + if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; + if (unlikely(i > reserveEnd)) return nullptr; + bytes_in = bytes = min(bytes,reserveEnd-i); + + if (i+bytes > allocEnd) { + if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); + } + return &data[i]; + } + + void* ptr() { + return &data[cur]; + } + + void reset_block () + { + allocEnd = max(allocEnd,(size_t)cur); + cur = 0; + } + + size_t getBlockUsedBytes() const { + return min(size_t(cur),reserveEnd); + } + + size_t getBlockFreeBytes() const { + return getBlockAllocatedBytes() - getBlockUsedBytes(); + } + + size_t getBlockAllocatedBytes() const { + return min(max(allocEnd,size_t(cur)),reserveEnd); + } + + size_t getBlockWastedBytes() const { + const size_t sizeof_Header = offsetof(Block,data[0]); + return sizeof_Header + wasted; + } + + size_t getBlockReservedBytes() const { + return reserveEnd; + } + + bool hasType(AllocationType atype_i, bool huge_pages_i) const + { + if (atype_i == ANY_TYPE ) return true; + else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else return atype_i == atype; + } + + size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockUsedBytes(); + } + return bytes; + } + + size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockFreeBytes(); + } + return bytes; + } + + size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockWastedBytes(); + } + return bytes; + } + + size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockAllocatedBytes(); + } + return bytes; + } + + void print_list () + { + for (const Block* block = this; block; block = block->next) + block->print_block(); + } + + void print_block() const + { + if (atype == ALIGNED_MALLOC) std::cout << "A"; + else if (atype == EMBREE_OS_MALLOC) std::cout << "O"; + else if (atype == SHARED) std::cout << "S"; + if (huge_pages) std::cout << "H"; + size_t bytesUsed = getBlockUsedBytes(); + size_t bytesFree = getBlockFreeBytes(); + size_t bytesWasted = getBlockWastedBytes(); + std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] "; + } + + public: + std::atomic<size_t> cur; //!< current location of the allocator + std::atomic<size_t> allocEnd; //!< end of the allocated memory region + std::atomic<size_t> reserveEnd; //!< end of the reserved memory region + Block* next; //!< pointer to next block in list + size_t wasted; //!< amount of memory wasted through block alignment + AllocationType atype; //!< allocation mode of the block + bool huge_pages; //!< whether the block uses huge pages + char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment + char data[1]; //!< here starts memory to use for allocations + }; + + private: + Device* device; + SpinLock mutex; + size_t slotMask; + std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; + std::atomic<Block*> usedBlocks; + std::atomic<Block*> freeBlocks; + + std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#else + SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#endif + + bool use_single_mode; + size_t defaultBlockSize; + size_t estimatedSize; + size_t growSize; + size_t maxGrowSize; + std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove + std::atomic<size_t> bytesUsed; + std::atomic<size_t> bytesFree; + std::atomic<size_t> bytesWasted; + static __thread ThreadLocal2* thread_local_allocator2; + static SpinLock s_thread_local_allocators_lock; + static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex thread_local_allocators_lock; +#else + SpinLock thread_local_allocators_lock; +#endif + std::vector<ThreadLocal2*> thread_local_allocators; + AllocationType atype; + mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h new file mode 100644 index 0000000000..02d319c59d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/buffer.h @@ -0,0 +1,263 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" + +namespace embree +{ + /*! Implements an API data buffer object. This class may or may not own the data. */ + class Buffer : public RefCount + { + public: + /*! Buffer construction */ + Buffer() + : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {} + + /*! Buffer construction */ + Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr) + : device(device), numBytes(numBytes_in) + { + device->refInc(); + + if (ptr_in) + { + shared = true; + ptr = (char*)ptr_in; + } + else + { + shared = false; + alloc(); + } + } + + /*! Buffer destruction */ + ~Buffer() { + free(); + device->refDec(); + } + + /*! this class is not copyable */ + private: + Buffer(const Buffer& other) DELETED; // do not implement + Buffer& operator =(const Buffer& other) DELETED; // do not implement + + public: + /* inits and allocates the buffer */ + void create(Device* device_in, size_t numBytes_in) + { + init(device_in, numBytes_in); + alloc(); + } + + /* inits the buffer */ + void init(Device* device_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = nullptr; + numBytes = numBytes_in; + shared = false; + } + + /*! sets shared buffer */ + void set(Device* device_in, void* ptr_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = (char*)ptr_in; + if (numBytes_in != (size_t)-1) + numBytes = numBytes_in; + shared = true; + } + + /*! allocated buffer */ + void alloc() + { + if (device) + device->memoryMonitor(this->bytes(), false); + size_t b = (this->bytes()+15) & ssize_t(-16); + ptr = (char*)alignedMalloc(b,16); + } + + /*! frees the buffer */ + void free() + { + if (shared) return; + alignedFree(ptr); + if (device) + device->memoryMonitor(-ssize_t(this->bytes()), true); + ptr = nullptr; + } + + /*! gets buffer pointer */ + void* data() + { + /* report error if buffer is not existing */ + if (!device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified"); + + /* return buffer */ + return ptr; + } + + /*! returns pointer to first element */ + __forceinline char* getPtr() const { + return ptr; + } + + /*! returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return numBytes; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr; + } + + public: + Device* device; //!< device to report memory usage to + char* ptr; //!< pointer to buffer data + size_t numBytes; //!< number of bytes in the buffer + bool shared; //!< set if memory is shared with application + }; + + /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */ + class RawBufferView + { + public: + /*! Buffer construction */ + RawBufferView() + : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {} + + public: + /*! sets the buffer view */ + void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in) + { + if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds"); + + ptr_ofs = buffer_in->ptr + offset_in; + stride = stride_in; + num = num_in; + format = format_in; + modCounter++; + modified = true; + buffer = buffer_in; + } + + /*! returns pointer to the first element */ + __forceinline char* getPtr() const { + return ptr_ofs; + } + + /*! returns pointer to the i'th element */ + __forceinline char* getPtr(size_t i) const + { + assert(i<num); + return ptr_ofs + i*stride; + } + + /*! returns the number of elements of the buffer */ + __forceinline size_t size() const { + return num; + } + + /*! returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return num*stride; + } + + /*! returns the buffer stride */ + __forceinline unsigned getStride() const + { + assert(stride <= unsigned(inf)); + return unsigned(stride); + } + + /*! return the buffer format */ + __forceinline RTCFormat getFormat() const { + return format; + } + + /*! mark buffer as modified or unmodified */ + __forceinline void setModified() { + modCounter++; + modified = true; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isModified(unsigned int otherModCounter) const { + return modCounter > otherModCounter; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isLocalModified() const { + return modified; + } + + /*! clear local modified flag */ + __forceinline void clearLocalModified() { + modified = false; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr_ofs; + } + + /*! checks padding to 16 byte check, fails hard */ + __forceinline void checkPadding16() const + { + if (ptr_ofs && num) + volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable? + } + + public: + char* ptr_ofs; //!< base pointer plus offset + size_t stride; //!< stride of the buffer in bytes + size_t num; //!< number of elements in the buffer + RTCFormat format; //!< format of the buffer + unsigned int modCounter; //!< version ID of this buffer + bool modified; //!< local modified data + int userData; //!< special data + Ref<Buffer> buffer; //!< reference to the parent buffer + }; + + /*! A typed contiguous range of a buffer. This class does not own the buffer content. */ + template<typename T> + class BufferView : public RawBufferView + { + public: + typedef T value_type; + + /*! access to the ith element of the buffer */ + __forceinline T& operator [](size_t i) { assert(i<num); return *(T*)(ptr_ofs + i*stride); } + __forceinline const T& operator [](size_t i) const { assert(i<num); return *(T*)(ptr_ofs + i*stride); } + }; + + template<> + class BufferView<Vec3fa> : public RawBufferView + { + public: + typedef Vec3fa value_type; + + /*! access to the ith element of the buffer */ + __forceinline const Vec3fa operator [](size_t i) const + { + assert(i<num); + return Vec3fa(vfloat4::loadu((float*)(ptr_ofs + i*stride))); + } + + /*! writes the i'th element */ + __forceinline void store(size_t i, const Vec3fa& v) + { + assert(i<num); + vfloat4::storeu((float*)(ptr_ofs + i*stride), (vfloat4)v); + } + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/builder.h b/thirdparty/embree-aarch64/kernels/common/builder.h new file mode 100644 index 0000000000..d2a1cfe3ce --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/builder.h @@ -0,0 +1,60 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "accel.h" + +namespace embree +{ +#define MODE_HIGH_QUALITY (1<<8) + + /*! virtual interface for all hierarchy builders */ + class Builder : public RefCount { + public: + + static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + /*! initiates the hierarchy builder */ + virtual void build() = 0; + + /*! notifies the builder about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears internal builder state */ + virtual void clear() = 0; + }; + + /*! virtual interface for progress monitor class */ + struct BuildProgressMonitor { + virtual void operator() (size_t dn) const = 0; + }; + + /*! build the progress monitor interface from a closure */ + template<typename Closure> + struct ProgressMonitorClosure : BuildProgressMonitor + { + public: + ProgressMonitorClosure (const Closure& closure) : closure(closure) {} + void operator() (size_t dn) const { closure(dn); } + private: + const Closure closure; + }; + template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) { + return ProgressMonitorClosure<Closure>(closure); + } + + struct LineSegments; + struct TriangleMesh; + struct QuadMesh; + struct UserGeometry; + + class Scene; + + typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder); + typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + +} diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h new file mode 100644 index 0000000000..d0185a74f2 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/context.h @@ -0,0 +1,131 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "rtcore.h" +#include "point_query.h" + +namespace embree +{ + class Scene; + + struct IntersectContext + { + public: + __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context) + : scene(scene), user(user_context) {} + + __forceinline bool hasContextFilter() const { + return user->filter != nullptr; + } + + __forceinline bool isCoherent() const { + return embree::isCoherent(user->flags); + } + + __forceinline bool isIncoherent() const { + return embree::isIncoherent(user->flags); + } + + public: + Scene* scene; + RTCIntersectContext* user; + }; + + template<int M, typename Geometry> + __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v) + { +#if RTC_MIN_WIDTH + const vfloat<M> d = length(Vec3vf<M>(v) - ray_org); + const vfloat<M> r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec4vf<M>(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + template<typename Geometry> + __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v) + { +#if RTC_MIN_WIDTH + const float d = length(Vec3fa(v) - ray_org); + const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec3ff(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + enum PointQueryType + { + POINT_QUERY_TYPE_UNDEFINED = 0, + POINT_QUERY_TYPE_SPHERE = 1, + POINT_QUERY_TYPE_AABB = 2, + }; + + typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + + struct PointQueryContext + { + public: + __forceinline PointQueryContext(Scene* scene, + PointQuery* query_ws, + PointQueryType query_type, + PointQueryFunction func, + RTCPointQueryContext* userContext, + float similarityScale, + void* userPtr) + : scene(scene) + , query_ws(query_ws) + , query_type(query_type) + , func(func) + , userContext(userContext) + , similarityScale(similarityScale) + , userPtr(userPtr) + , primID(RTC_INVALID_GEOMETRY_ID) + , geomID(RTC_INVALID_GEOMETRY_ID) + , query_radius(query_ws->radius) + { + if (query_type == POINT_QUERY_TYPE_AABB) { + assert(similarityScale == 0.f); + updateAABB(); + } + if (userContext->instStackSize == 0) { + assert(similarityScale == 1.f); + } + } + + public: + __forceinline void updateAABB() + { + if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) { + query_radius = Vec3fa(query_ws->radius); + return; + } + + const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius)); + bbox = xfmBounds(m, bbox); + query_radius = 0.5f * (bbox.upper - bbox.lower); + } + +public: + Scene* scene; + + PointQuery* query_ws; // the original world space point query + PointQueryType query_type; + PointQueryFunction func; + RTCPointQueryContext* userContext; + const float similarityScale; + + void* userPtr; + + unsigned int primID; + unsigned int geomID; + + Vec3fa query_radius; // used if the query is converted to an AABB internally + }; +} + diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h new file mode 100644 index 0000000000..709119163b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/default.h @@ -0,0 +1,273 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" +#include "../../common/sys/thread.h" +#include "../../common/sys/alloc.h" +#include "../../common/sys/ref.h" +#include "../../common/sys/intrinsics.h" +#include "../../common/sys/atomic.h" +#include "../../common/sys/mutex.h" +#include "../../common/sys/vector.h" +#include "../../common/sys/array.h" +#include "../../common/sys/string.h" +#include "../../common/sys/regression.h" +#include "../../common/sys/vector.h" + +#include "../../common/math/math.h" +#include "../../common/math/transcendental.h" +#include "../../common/simd/simd.h" +#include "../../common/math/vec2.h" +#include "../../common/math/vec3.h" +#include "../../common/math/vec4.h" +#include "../../common/math/vec2fa.h" +#include "../../common/math/vec3fa.h" +#include "../../common/math/interval.h" +#include "../../common/math/bbox.h" +#include "../../common/math/obbox.h" +#include "../../common/math/lbbox.h" +#include "../../common/math/linearspace2.h" +#include "../../common/math/linearspace3.h" +#include "../../common/math/affinespace.h" +#include "../../common/math/range.h" +#include "../../common/lexers/tokenstream.h" + +#include "../../common/tasking/taskscheduler.h" + +#define COMMA , + +#include "../config.h" +#include "isa.h" +#include "stat.h" +#include "profile.h" +#include "rtcore.h" +#include "vector.h" +#include "state.h" +#include "instance_stack.h" + +#include <vector> +#include <map> +#include <algorithm> +#include <functional> +#include <utility> +#include <sstream> + +#if !defined(_DEBUG) && defined(BUILD_IOS) +#undef assert +#define assert(_EXPR) +#endif + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Vec2 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec2vf = Vec2<vfloat<N>>; + template<int N> using Vec2vd = Vec2<vdouble<N>>; + template<int N> using Vec2vr = Vec2<vreal<N>>; + template<int N> using Vec2vi = Vec2<vint<N>>; + template<int N> using Vec2vl = Vec2<vllong<N>>; + template<int N> using Vec2vb = Vec2<vbool<N>>; + template<int N> using Vec2vbf = Vec2<vboolf<N>>; + template<int N> using Vec2vbd = Vec2<vboold<N>>; + + typedef Vec2<vfloat4> Vec2vf4; + typedef Vec2<vdouble4> Vec2vd4; + typedef Vec2<vreal4> Vec2vr4; + typedef Vec2<vint4> Vec2vi4; + typedef Vec2<vllong4> Vec2vl4; + typedef Vec2<vbool4> Vec2vb4; + typedef Vec2<vboolf4> Vec2vbf4; + typedef Vec2<vboold4> Vec2vbd4; + + typedef Vec2<vfloat8> Vec2vf8; + typedef Vec2<vdouble8> Vec2vd8; + typedef Vec2<vreal8> Vec2vr8; + typedef Vec2<vint8> Vec2vi8; + typedef Vec2<vllong8> Vec2vl8; + typedef Vec2<vbool8> Vec2vb8; + typedef Vec2<vboolf8> Vec2vbf8; + typedef Vec2<vboold8> Vec2vbd8; + + typedef Vec2<vfloat16> Vec2vf16; + typedef Vec2<vdouble16> Vec2vd16; + typedef Vec2<vreal16> Vec2vr16; + typedef Vec2<vint16> Vec2vi16; + typedef Vec2<vllong16> Vec2vl16; + typedef Vec2<vbool16> Vec2vb16; + typedef Vec2<vboolf16> Vec2vbf16; + typedef Vec2<vboold16> Vec2vbd16; + + typedef Vec2<vfloatx> Vec2vfx; + typedef Vec2<vdoublex> Vec2vdx; + typedef Vec2<vrealx> Vec2vrx; + typedef Vec2<vintx> Vec2vix; + typedef Vec2<vllongx> Vec2vlx; + typedef Vec2<vboolx> Vec2vbx; + typedef Vec2<vboolfx> Vec2vbfx; + typedef Vec2<vbooldx> Vec2vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec3 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec3vf = Vec3<vfloat<N>>; + template<int N> using Vec3vd = Vec3<vdouble<N>>; + template<int N> using Vec3vr = Vec3<vreal<N>>; + template<int N> using Vec3vi = Vec3<vint<N>>; + template<int N> using Vec3vl = Vec3<vllong<N>>; + template<int N> using Vec3vb = Vec3<vbool<N>>; + template<int N> using Vec3vbf = Vec3<vboolf<N>>; + template<int N> using Vec3vbd = Vec3<vboold<N>>; + + typedef Vec3<vfloat4> Vec3vf4; + typedef Vec3<vdouble4> Vec3vd4; + typedef Vec3<vreal4> Vec3vr4; + typedef Vec3<vint4> Vec3vi4; + typedef Vec3<vllong4> Vec3vl4; + typedef Vec3<vbool4> Vec3vb4; + typedef Vec3<vboolf4> Vec3vbf4; + typedef Vec3<vboold4> Vec3vbd4; + + typedef Vec3<vfloat8> Vec3vf8; + typedef Vec3<vdouble8> Vec3vd8; + typedef Vec3<vreal8> Vec3vr8; + typedef Vec3<vint8> Vec3vi8; + typedef Vec3<vllong8> Vec3vl8; + typedef Vec3<vbool8> Vec3vb8; + typedef Vec3<vboolf8> Vec3vbf8; + typedef Vec3<vboold8> Vec3vbd8; + + typedef Vec3<vfloat16> Vec3vf16; + typedef Vec3<vdouble16> Vec3vd16; + typedef Vec3<vreal16> Vec3vr16; + typedef Vec3<vint16> Vec3vi16; + typedef Vec3<vllong16> Vec3vl16; + typedef Vec3<vbool16> Vec3vb16; + typedef Vec3<vboolf16> Vec3vbf16; + typedef Vec3<vboold16> Vec3vbd16; + + typedef Vec3<vfloatx> Vec3vfx; + typedef Vec3<vdoublex> Vec3vdx; + typedef Vec3<vrealx> Vec3vrx; + typedef Vec3<vintx> Vec3vix; + typedef Vec3<vllongx> Vec3vlx; + typedef Vec3<vboolx> Vec3vbx; + typedef Vec3<vboolfx> Vec3vbfx; + typedef Vec3<vbooldx> Vec3vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec4 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec4vf = Vec4<vfloat<N>>; + template<int N> using Vec4vd = Vec4<vdouble<N>>; + template<int N> using Vec4vr = Vec4<vreal<N>>; + template<int N> using Vec4vi = Vec4<vint<N>>; + template<int N> using Vec4vl = Vec4<vllong<N>>; + template<int N> using Vec4vb = Vec4<vbool<N>>; + template<int N> using Vec4vbf = Vec4<vboolf<N>>; + template<int N> using Vec4vbd = Vec4<vboold<N>>; + + typedef Vec4<vfloat4> Vec4vf4; + typedef Vec4<vdouble4> Vec4vd4; + typedef Vec4<vreal4> Vec4vr4; + typedef Vec4<vint4> Vec4vi4; + typedef Vec4<vllong4> Vec4vl4; + typedef Vec4<vbool4> Vec4vb4; + typedef Vec4<vboolf4> Vec4vbf4; + typedef Vec4<vboold4> Vec4vbd4; + + typedef Vec4<vfloat8> Vec4vf8; + typedef Vec4<vdouble8> Vec4vd8; + typedef Vec4<vreal8> Vec4vr8; + typedef Vec4<vint8> Vec4vi8; + typedef Vec4<vllong8> Vec4vl8; + typedef Vec4<vbool8> Vec4vb8; + typedef Vec4<vboolf8> Vec4vbf8; + typedef Vec4<vboold8> Vec4vbd8; + + typedef Vec4<vfloat16> Vec4vf16; + typedef Vec4<vdouble16> Vec4vd16; + typedef Vec4<vreal16> Vec4vr16; + typedef Vec4<vint16> Vec4vi16; + typedef Vec4<vllong16> Vec4vl16; + typedef Vec4<vbool16> Vec4vb16; + typedef Vec4<vboolf16> Vec4vbf16; + typedef Vec4<vboold16> Vec4vbd16; + + typedef Vec4<vfloatx> Vec4vfx; + typedef Vec4<vdoublex> Vec4vdx; + typedef Vec4<vrealx> Vec4vrx; + typedef Vec4<vintx> Vec4vix; + typedef Vec4<vllongx> Vec4vlx; + typedef Vec4<vboolx> Vec4vbx; + typedef Vec4<vboolfx> Vec4vbfx; + typedef Vec4<vbooldx> Vec4vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Other shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using BBox3vf = BBox<Vec3vf<N>>; + typedef BBox<Vec3vf4> BBox3vf4; + typedef BBox<Vec3vf8> BBox3vf8; + typedef BBox<Vec3vf16> BBox3vf16; + + /* calculate time segment itime and fractional time ftime */ + __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime) + { + const float timeScaled = time * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime) + { + const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + template<int N> + __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime) + { + const vfloat<N> timeScaled = time * numTimeSegments; + const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint<N>(itimef); + } + + template<int N> + __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime) + { + const vfloat<N> timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint<N>(itimef); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments) + { + const float round_up = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step + const float round_down = 1.0f-2.0f*float(ulp); + const int itime_lower = (int)max(floor(round_up *time_range.lower*numTimeSegments), 0.0f); + const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments); + return make_range(itime_lower, itime_upper); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments) + { + const float lower = (range.lower-time_range.lower)/time_range.size(); + const float upper = (range.upper-time_range.lower)/time_range.size(); + return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments); + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp new file mode 100644 index 0000000000..16ec11b892 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/device.cpp @@ -0,0 +1,567 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "device.h" +#include "../hash.h" +#include "scene_triangle_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_subdiv_mesh.h" + +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +#include "../geometry/cylinder.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" + +#include "../../common/tasking/taskscheduler.h" +#include "../../common/sys/alloc.h" + +namespace embree +{ + /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */ + ssize_t Device::debug_int0 = 0; + ssize_t Device::debug_int1 = 0; + ssize_t Device::debug_int2 = 0; + ssize_t Device::debug_int3 = 0; + + DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs); + + static MutexSys g_mutex; + static std::map<Device*,size_t> g_cache_size_map; + static std::map<Device*,size_t> g_num_threads_map; + + Device::Device (const char* cfg) + { + /* check that CPU supports lowest ISA */ + if (!hasISA(ISA)) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR); + } + + /* set default frequency level for detected CPU */ + switch (getCPUModel()) { + case CPU::UNKNOWN: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_KABY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::SANDY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::NEHALEM: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE2: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break; + } + + /* initialize global state */ +#if defined(EMBREE_CONFIG) + State::parseString(EMBREE_CONFIG); +#endif + State::parseString(cfg); + if (!ignore_config_files && FileName::executableFolder() != FileName("")) + State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR))); + if (!ignore_config_files && FileName::homeFolder() != FileName("")) + State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR))); + State::verify(); + + /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */ + if (!checkISASupport()) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA"); + } + + /*! do some internal tests */ + assert(isa::Cylinder::verify()); + + /*! enable huge page support if desired */ +#if defined(__WIN32__) + if (State::enable_selockmemoryprivilege) + State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3)); +#endif + State::hugepages_success &= os_init(State::hugepages,State::verbosity(3)); + + /*! set tessellation cache size */ + setCacheSize( State::tessellation_cache_size ); + + /*! enable some floating point exceptions to catch bugs */ + if (State::float_exceptions) + { + int exceptions = _MM_MASK_MASK; + //exceptions &= ~_MM_MASK_INVALID; + exceptions &= ~_MM_MASK_DENORM; + exceptions &= ~_MM_MASK_DIV_ZERO; + //exceptions &= ~_MM_MASK_OVERFLOW; + //exceptions &= ~_MM_MASK_UNDERFLOW; + //exceptions &= ~_MM_MASK_INEXACT; + _MM_SET_EXCEPTION_MASK(exceptions); + } + + /* print info header */ + if (State::verbosity(1)) + print(); + if (State::verbosity(2)) + State::print(); + + /* register all algorithms */ + bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features)); + +#if defined(EMBREE_TARGET_SIMD8) + bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features)); +#endif + + /* setup tasking system */ + initTaskingSystem(numThreads); + + /* ray stream SOA to AOS conversion */ +#if defined(EMBREE_RAY_PACKETS) + RayStreamFilterFuncsType rayStreamFilterFuncs; + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs); + rayStreamFilters = rayStreamFilterFuncs(); +#endif + } + + Device::~Device () + { + setCacheSize(0); + exitTaskingSystem(); + } + + std::string getEnabledTargets() + { + std::string v; +#if defined(EMBREE_TARGET_SSE2) + v += "SSE2 "; +#endif +#if defined(EMBREE_TARGET_SSE42) + v += "SSE4.2 "; +#endif +#if defined(EMBREE_TARGET_AVX) + v += "AVX "; +#endif +#if defined(EMBREE_TARGET_AVX2) + v += "AVX2 "; +#endif +#if defined(EMBREE_TARGET_AVX512KNL) + v += "AVX512KNL "; +#endif +#if defined(EMBREE_TARGET_AVX512SKX) + v += "AVX512SKX "; +#endif + return v; + } + + std::string getEmbreeFeatures() + { + std::string v; +#if defined(EMBREE_RAY_MASK) + v += "raymasks "; +#endif +#if defined (EMBREE_BACKFACE_CULLING) + v += "backfaceculling "; +#endif +#if defined (EMBREE_BACKFACE_CULLING_CURVES) + v += "backfacecullingcurves "; +#endif +#if defined(EMBREE_FILTER_FUNCTION) + v += "intersection_filter "; +#endif +#if defined (EMBREE_COMPACT_POLYS) + v += "compact_polys "; +#endif + return v; + } + + void Device::print() + { + const int cpu_features = getCPUFeatures(); + std::cout << std::endl; + std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl; + std::cout << " Compiler : " << getCompilerName() << std::endl; + std::cout << " Build : "; +#if defined(DEBUG) + std::cout << "Debug " << std::endl; +#else + std::cout << "Release " << std::endl; +#endif + std::cout << " Platform : " << getPlatformName() << std::endl; + std::cout << " CPU : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl; + std::cout << " Threads : " << getNumberOfLogicalThreads() << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(cpu_features) << std::endl; + const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; + const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; + std::cout << " MXCSR : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl; + std::cout << " Config" << std::endl; + std::cout << " Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl; + std::cout << " " << getEnabledTargets() << " (compile time enabled)" << std::endl; + std::cout << " Features: " << getEmbreeFeatures() << std::endl; + std::cout << " Tasking : "; +#if defined(TASKING_TBB) + std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " "; + #if TBB_INTERFACE_VERSION >= 12002 + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " "; + #else + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " "; + #endif +#endif +#if defined(TASKING_INTERNAL) + std::cout << "internal_tasking_system "; +#endif +#if defined(TASKING_GCD) && defined(BUILD_IOS) + std::cout << "GCD tasking system "; +#endif +#if defined(TASKING_PPL) + std::cout << "PPL "; +#endif + std::cout << std::endl; + + /* check of FTZ and DAZ flags are set in CSR */ + if (!hasFTZ || !hasDAZ) + { +#if !defined(_DEBUG) + if (State::verbosity(1)) +#endif + { + std::cout << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << " WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled " << std::endl + << " in the MXCSR control and status register. This can have a severe " << std::endl + << " performance impact. Please enable these modes for each application " << std::endl + << " thread the following way:" << std::endl + << std::endl + << " #include \"xmmintrin.h\"" << std::endl + << " #include \"pmmintrin.h\"" << std::endl + << std::endl + << " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl + << " _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << std::endl; + } + } + std::cout << std::endl; + } + + void Device::setDeviceErrorCode(RTCError error) + { + RTCError* stored_error = errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getDeviceErrorCode() + { + RTCError* stored_error = errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::setThreadErrorCode(RTCError error) + { + RTCError* stored_error = g_errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getThreadErrorCode() + { + RTCError* stored_error = g_errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::process_error(Device* device, RTCError error, const char* str) + { + /* store global error code when device construction failed */ + if (!device) + return setThreadErrorCode(error); + + /* print error when in verbose mode */ + if (device->verbosity(1)) + { + switch (error) { + case RTC_ERROR_NONE : std::cerr << "Embree: No error"; break; + case RTC_ERROR_UNKNOWN : std::cerr << "Embree: Unknown error"; break; + case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break; + case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break; + case RTC_ERROR_OUT_OF_MEMORY : std::cerr << "Embree: Out of memory"; break; + case RTC_ERROR_UNSUPPORTED_CPU : std::cerr << "Embree: Unsupported CPU"; break; + default : std::cerr << "Embree: Invalid error code"; break; + }; + if (str) std::cerr << ", (" << str << ")"; + std::cerr << std::endl; + } + + /* call user specified error callback */ + if (device->error_function) + device->error_function(device->error_function_userptr,error,str); + + /* record error code */ + device->setDeviceErrorCode(error); + } + + void Device::memoryMonitor(ssize_t bytes, bool post) + { + if (State::memory_monitor_function && bytes != 0) { + if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) { + if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor + throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination"); + } + } + } + } + + size_t getMaxNumThreads() + { + size_t maxNumThreads = 0; + for (std::map<Device*,size_t>::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++) + maxNumThreads = max(maxNumThreads, (*i).second); + if (maxNumThreads == 0) + maxNumThreads = std::numeric_limits<size_t>::max(); + return maxNumThreads; + } + + size_t getMaxCacheSize() + { + size_t maxCacheSize = 0; + for (std::map<Device*,size_t>::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++) + maxCacheSize = max(maxCacheSize, (*i).second); + return maxCacheSize; + } + + void Device::setCacheSize(size_t bytes) + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + Lock<MutexSys> lock(g_mutex); + if (bytes == 0) g_cache_size_map.erase(this); + else g_cache_size_map[this] = bytes; + + size_t maxCacheSize = getMaxCacheSize(); + resizeTessellationCache(maxCacheSize); +#endif + } + + void Device::initTaskingSystem(size_t numThreads) + { + Lock<MutexSys> lock(g_mutex); + if (numThreads == 0) + g_num_threads_map[this] = std::numeric_limits<size_t>::max(); + else + g_num_threads_map[this] = numThreads; + + /* create task scheduler */ + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); +#if USE_TASK_ARENA + const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount()); + const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads); + arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads)); +#endif + } + + void Device::exitTaskingSystem() + { + Lock<MutexSys> lock(g_mutex); + g_num_threads_map.erase(this); + + /* terminate tasking system */ + if (g_num_threads_map.size() == 0) { + TaskScheduler::destroy(); + } + /* or configure new number of threads */ + else { + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); + } +#if USE_TASK_ARENA + arena.reset(); +#endif + } + + void Device::setProperty(const RTCDeviceProperty prop, ssize_t val) + { + /* hidden internal properties */ + switch ((size_t)prop) + { + case 1000000: debug_int0 = val; return; + case 1000001: debug_int1 = val; return; + case 1000002: debug_int2 = val; return; + case 1000003: debug_int3 = val; return; + } + + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property"); + } + + ssize_t Device::getProperty(const RTCDeviceProperty prop) + { + size_t iprop = (size_t)prop; + + /* get name of internal regression test */ + if (iprop >= 2000000 && iprop < 3000000) + { + RegressionTest* test = getRegressionTest(iprop-2000000); + if (test) return (ssize_t) test->name.c_str(); + else return 0; + } + + /* run internal regression test */ + if (iprop >= 3000000 && iprop < 4000000) + { + RegressionTest* test = getRegressionTest(iprop-3000000); + if (test) return test->run(); + else return 0; + } + + /* documented properties */ + switch (prop) + { + case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR; + case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR; + case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH; + case RTC_DEVICE_PROPERTY_VERSION : return RTC_VERSION; + +#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return hasISA(SSE2); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return hasISA(AVX); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_MASK) + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING_CURVES) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0; +#endif + +#if defined(EMBREE_COMPACT_POLYS) + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0; +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0; +#endif + +#if defined(TASKING_INTERNAL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; +#endif + +#if defined(TASKING_TBB) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; +#endif + +#if defined(TASKING_GCD) && defined(BUILD_IOS) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_USER) + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#else + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1; +#endif + +#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0; +#endif + + default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/device.h b/thirdparty/embree-aarch64/kernels/common/device.h new file mode 100644 index 0000000000..e9a81bb109 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/device.h @@ -0,0 +1,85 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "state.h" +#include "accel.h" + +namespace embree +{ + class BVH4Factory; + class BVH8Factory; + + class Device : public State, public MemoryMonitorInterface + { + ALIGNED_CLASS_(16); + + public: + + /*! Device construction */ + Device (const char* cfg); + + /*! Device destruction */ + virtual ~Device (); + + /*! prints info about the device */ + void print(); + + /*! sets the error code */ + void setDeviceErrorCode(RTCError error); + + /*! returns and clears the error code */ + RTCError getDeviceErrorCode(); + + /*! sets the error code */ + static void setThreadErrorCode(RTCError error); + + /*! returns and clears the error code */ + static RTCError getThreadErrorCode(); + + /*! processes error codes, do not call directly */ + static void process_error(Device* device, RTCError error, const char* str); + + /*! invokes the memory monitor callback */ + void memoryMonitor(ssize_t bytes, bool post); + + /*! sets the size of the software cache. */ + void setCacheSize(size_t bytes); + + /*! sets a property */ + void setProperty(const RTCDeviceProperty prop, ssize_t val); + + /*! gets a property */ + ssize_t getProperty(const RTCDeviceProperty prop); + + private: + + /*! initializes the tasking system */ + void initTaskingSystem(size_t numThreads); + + /*! shuts down the tasking system */ + void exitTaskingSystem(); + + /*! some variables that can be set via rtcSetParameter1i for debugging purposes */ + public: + static ssize_t debug_int0; + static ssize_t debug_int1; + static ssize_t debug_int2; + static ssize_t debug_int3; + + public: + std::unique_ptr<BVH4Factory> bvh4_factory; +#if defined(EMBREE_TARGET_SIMD8) + std::unique_ptr<BVH8Factory> bvh8_factory; +#endif + +#if USE_TASK_ARENA + std::unique_ptr<tbb::task_arena> arena; +#endif + + /* ray streams filter */ + RayStreamFilterFuncs rayStreamFilters; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp new file mode 100644 index 0000000000..b3aa8e3396 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/geometry.cpp @@ -0,0 +1,259 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "geometry.h" +#include "scene.h" + +namespace embree +{ + const char* Geometry::gtype_names[Geometry::GTY_END] = + { + "flat_linear_curve", + "round_linear_curve", + "oriented_linear_curve", + "", + "flat_bezier_curve", + "round_bezier_curve", + "oriented_bezier_curve", + "", + "flat_bspline_curve", + "round_bspline_curve", + "oriented_bspline_curve", + "", + "flat_hermite_curve", + "round_hermite_curve", + "oriented_hermite_curve", + "", + "flat_catmull_rom_curve", + "round_catmull_rom_curve", + "oriented_catmull_rom_curve", + "", + "triangles", + "quads", + "grid", + "subdivs", + "", + "sphere", + "disc", + "oriented_disc", + "", + "usergeom", + "instance_cheap", + "instance_expensive", + }; + + Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) + : device(device), userPtr(nullptr), + numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), + mask(-1), + gtype(gtype), + gsubtype(GTY_SUBTYPE_DEFAULT), + quality(RTC_BUILD_QUALITY_MEDIUM), + state((unsigned)State::MODIFIED), + enabled(true), + intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr) + { + device->refInc(); + } + + Geometry::~Geometry() + { + device->refDec(); + } + + void Geometry::setNumPrimitives(unsigned int numPrimitives_in) + { + if (numPrimitives_in == numPrimitives) return; + + numPrimitives = numPrimitives_in; + + Geometry::update(); + } + + void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in) + { + if (numTimeSteps_in == numTimeSteps) { + return; + } + + numTimeSteps = numTimeSteps_in; + fnumTimeSegments = float(numTimeSteps_in-1); + + Geometry::update(); + } + + void Geometry::setTimeRange (const BBox1f range) + { + time_range = range; + Geometry::update(); + } + + void Geometry::update() + { + ++modCounter_; // FIXME: required? + state = (unsigned)State::MODIFIED; + } + + void Geometry::commit() + { + ++modCounter_; + state = (unsigned)State::COMMITTED; + } + + void Geometry::preCommit() + { + if (State::MODIFIED == (State)state) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed"); + } + + void Geometry::postCommit() + { + } + + void Geometry::enable () + { + if (isEnabled()) + return; + + enabled = true; + ++modCounter_; + } + + void Geometry::disable () + { + if (isDisabled()) + return; + + enabled = false; + ++modCounter_; + } + + void Geometry::setUserData (void* ptr) + { + userPtr = ptr; + } + + void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + intersectionFilterN = filter; + } + + void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + occlusionFilterN = filter; + } + + void Geometry::setPointQueryFunction (RTCPointQueryFunction func) + { + pointQueryFunc = func; + } + + void Geometry::interpolateN(const RTCInterpolateNArguments* const args) + { + const void* valid_i = args->valid; + const unsigned* primIDs = args->primIDs; + const float* u = args->u; + const float* v = args->v; + unsigned int N = args->N; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); + const int* valid = (const int*) valid_i; + + __aligned(64) float P_tmp[256]; + __aligned(64) float dPdu_tmp[256]; + __aligned(64) float dPdv_tmp[256]; + __aligned(64) float ddPdudu_tmp[256]; + __aligned(64) float ddPdvdv_tmp[256]; + __aligned(64) float ddPdudv_tmp[256]; + + float* Pt = P ? P_tmp : nullptr; + float* dPdut = nullptr, *dPdvt = nullptr; + if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; } + float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr; + if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; } + + for (unsigned int i=0; i<N; i++) + { + if (valid && !valid[i]) continue; + + RTCInterpolateArguments iargs; + iargs.primID = primIDs[i]; + iargs.u = u[i]; + iargs.v = v[i]; + iargs.bufferType = bufferType; + iargs.bufferSlot = bufferSlot; + iargs.P = Pt; + iargs.dPdu = dPdut; + iargs.dPdv = dPdvt; + iargs.ddPdudu = ddPdudut; + iargs.ddPdvdv = ddPdvdvt; + iargs.ddPdudv = ddPdudvt; + iargs.valueCount = valueCount; + interpolate(&iargs); + + if (likely(P)) { + for (unsigned int j=0; j<valueCount; j++) + P[j*N+i] = Pt[j]; + } + if (likely(dPdu)) + { + for (unsigned int j=0; j<valueCount; j++) { + dPdu[j*N+i] = dPdut[j]; + dPdv[j*N+i] = dPdvt[j]; + } + } + if (likely(ddPdudu)) + { + for (unsigned int j=0; j<valueCount; j++) { + ddPdudu[j*N+i] = ddPdudut[j]; + ddPdvdv[j*N+i] = ddPdvdvt[j]; + ddPdudv[j*N+i] = ddPdudvt[j]; + } + } + } + } + + bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context) + { + assert(context->primID < size()); + + RTCPointQueryFunctionArguments args; + args.query = (RTCPointQuery*)context->query_ws; + args.userPtr = context->userPtr; + args.primID = context->primID; + args.geomID = context->geomID; + args.context = context->userContext; + args.similarityScale = context->similarityScale; + + bool update = false; + if(context->func) update |= context->func(&args); + if(pointQueryFunc) update |= pointQueryFunc(&args); + + if (update && context->userContext->instStackSize > 0) + { + // update point query + if (context->query_type == POINT_QUERY_TYPE_AABB) { + context->updateAABB(); + } else { + assert(context->similarityScale > 0.f); + query->radius = context->query_ws->radius * context->similarityScale; + } + } + return update; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h new file mode 100644 index 0000000000..953974bfd2 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/geometry.h @@ -0,0 +1,582 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "buffer.h" +#include "../common/point_query.h" +#include "../builders/priminfo.h" + +namespace embree +{ + class Scene; + class Geometry; + + struct GeometryCounts + { + __forceinline GeometryCounts() + : numFilterFunctions(0), + numTriangles(0), numMBTriangles(0), + numQuads(0), numMBQuads(0), + numBezierCurves(0), numMBBezierCurves(0), + numLineSegments(0), numMBLineSegments(0), + numSubdivPatches(0), numMBSubdivPatches(0), + numUserGeometries(0), numMBUserGeometries(0), + numInstancesCheap(0), numMBInstancesCheap(0), + numInstancesExpensive(0), numMBInstancesExpensive(0), + numGrids(0), numMBGrids(0), + numPoints(0), numMBPoints(0) {} + + __forceinline size_t size() const { + return numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints + + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints; + } + + __forceinline unsigned int enabledGeometryTypesMask() const + { + unsigned int mask = 0; + if (numTriangles) mask |= 1 << 0; + if (numQuads) mask |= 1 << 1; + if (numBezierCurves+numLineSegments) mask |= 1 << 2; + if (numSubdivPatches) mask |= 1 << 3; + if (numUserGeometries) mask |= 1 << 4; + if (numInstancesCheap) mask |= 1 << 5; + if (numInstancesExpensive) mask |= 1 << 6; + if (numGrids) mask |= 1 << 7; + if (numPoints) mask |= 1 << 8; + + unsigned int maskMB = 0; + if (numMBTriangles) maskMB |= 1 << 0; + if (numMBQuads) maskMB |= 1 << 1; + if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2; + if (numMBSubdivPatches) maskMB |= 1 << 3; + if (numMBUserGeometries) maskMB |= 1 << 4; + if (numMBInstancesCheap) maskMB |= 1 << 5; + if (numMBInstancesExpensive) maskMB |= 1 << 6; + if (numMBGrids) maskMB |= 1 << 7; + if (numMBPoints) maskMB |= 1 << 8; + + return (mask<<8) + maskMB; + } + + __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const + { + GeometryCounts ret; + ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions; + ret.numTriangles = numTriangles + rhs.numTriangles; + ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles; + ret.numQuads = numQuads + rhs.numQuads; + ret.numMBQuads = numMBQuads + rhs.numMBQuads; + ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves; + ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves; + ret.numLineSegments = numLineSegments + rhs.numLineSegments; + ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments; + ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches; + ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches; + ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries; + ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries; + ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap; + ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap; + ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive; + ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive; + ret.numGrids = numGrids + rhs.numGrids; + ret.numMBGrids = numMBGrids + rhs.numMBGrids; + ret.numPoints = numPoints + rhs.numPoints; + ret.numMBPoints = numMBPoints + rhs.numMBPoints; + + return ret; + } + + size_t numFilterFunctions; //!< number of geometries with filter functions enabled + size_t numTriangles; //!< number of enabled triangles + size_t numMBTriangles; //!< number of enabled motion blured triangles + size_t numQuads; //!< number of enabled quads + size_t numMBQuads; //!< number of enabled motion blurred quads + size_t numBezierCurves; //!< number of enabled curves + size_t numMBBezierCurves; //!< number of enabled motion blurred curves + size_t numLineSegments; //!< number of enabled line segments + size_t numMBLineSegments; //!< number of enabled line motion blurred segments + size_t numSubdivPatches; //!< number of enabled subdivision patches + size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches + size_t numUserGeometries; //!< number of enabled user geometries + size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries + size_t numInstancesCheap; //!< number of enabled cheap instances + size_t numMBInstancesCheap; //!< number of enabled motion blurred cheap instances + size_t numInstancesExpensive; //!< number of enabled expensive instances + size_t numMBInstancesExpensive; //!< number of enabled motion blurred expensive instances + size_t numGrids; //!< number of enabled grid geometries + size_t numMBGrids; //!< number of enabled motion blurred grid geometries + size_t numPoints; //!< number of enabled points + size_t numMBPoints; //!< number of enabled motion blurred points + }; + + /*! Base class all geometries are derived from */ + class Geometry : public RefCount + { + friend class Scene; + public: + + /*! type of geometry */ + enum GType + { + GTY_FLAT_LINEAR_CURVE = 0, + GTY_ROUND_LINEAR_CURVE = 1, + GTY_ORIENTED_LINEAR_CURVE = 2, + GTY_CONE_LINEAR_CURVE = 3, + + GTY_FLAT_BEZIER_CURVE = 4, + GTY_ROUND_BEZIER_CURVE = 5, + GTY_ORIENTED_BEZIER_CURVE = 6, + + GTY_FLAT_BSPLINE_CURVE = 8, + GTY_ROUND_BSPLINE_CURVE = 9, + GTY_ORIENTED_BSPLINE_CURVE = 10, + + GTY_FLAT_HERMITE_CURVE = 12, + GTY_ROUND_HERMITE_CURVE = 13, + GTY_ORIENTED_HERMITE_CURVE = 14, + + GTY_FLAT_CATMULL_ROM_CURVE = 16, + GTY_ROUND_CATMULL_ROM_CURVE = 17, + GTY_ORIENTED_CATMULL_ROM_CURVE = 18, + + GTY_TRIANGLE_MESH = 20, + GTY_QUAD_MESH = 21, + GTY_GRID_MESH = 22, + GTY_SUBDIV_MESH = 23, + + GTY_SPHERE_POINT = 25, + GTY_DISC_POINT = 26, + GTY_ORIENTED_DISC_POINT = 27, + + GTY_USER_GEOMETRY = 29, + GTY_INSTANCE_CHEAP = 30, + GTY_INSTANCE_EXPENSIVE = 31, + GTY_END = 32, + + GTY_BASIS_LINEAR = 0, + GTY_BASIS_BEZIER = 4, + GTY_BASIS_BSPLINE = 8, + GTY_BASIS_HERMITE = 12, + GTY_BASIS_CATMULL_ROM = 16, + GTY_BASIS_MASK = 28, + + GTY_SUBTYPE_FLAT_CURVE = 0, + GTY_SUBTYPE_ROUND_CURVE = 1, + GTY_SUBTYPE_ORIENTED_CURVE = 2, + GTY_SUBTYPE_MASK = 3, + }; + + enum GSubType + { + GTY_SUBTYPE_DEFAULT= 0, + GTY_SUBTYPE_INSTANCE_LINEAR = 0, + GTY_SUBTYPE_INSTANCE_QUATERNION = 1 + }; + + enum GTypeMask + { + MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE, + MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE, + MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE, + MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE, + + MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE, + MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE, + MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE, + + MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE, + MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE, + MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE, + + MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE, + MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE, + MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE, + + MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE, + MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE, + MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE, + + MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE | + MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE | + MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE | + MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT, + MTY_DISC_POINT = 1ul << GTY_DISC_POINT, + MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT, + + MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT, + + MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS, + + MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH, + MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH, + MTY_GRID_MESH = 1ul << GTY_GRID_MESH, + MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH, + MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY, + + MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP, + MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE, + MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE + }; + + static const char* gtype_names[GTY_END]; + + enum class State : unsigned { + MODIFIED = 0, + COMMITTED = 1, + }; + + public: + + /*! Geometry constructor */ + Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps); + + /*! Geometry destructor */ + virtual ~Geometry(); + + public: + + /*! tests if geometry is enabled */ + __forceinline bool isEnabled() const { return enabled; } + + /*! tests if geometry is disabled */ + __forceinline bool isDisabled() const { return !isEnabled(); } + + /*! tests if that geometry has some filter function set */ + __forceinline bool hasFilterFunctions () const { + return (intersectionFilterN != nullptr) || (occlusionFilterN != nullptr); + } + + /*! returns geometry type */ + __forceinline GType getType() const { return gtype; } + + /*! returns curve type */ + __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); } + + /*! returns curve basis */ + __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); } + + /*! returns geometry type mask */ + __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); } + + /*! returns number of primitives */ + __forceinline size_t size() const { return numPrimitives; } + + /*! sets the number of primitives */ + virtual void setNumPrimitives(unsigned int numPrimitives_in); + + /*! sets number of time steps */ + virtual void setNumTimeSteps (unsigned int numTimeSteps_in); + + /*! sets motion blur time range */ + void setTimeRange (const BBox1f range); + + /*! sets number of vertex attributes */ + virtual void setVertexAttributeCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets number of topologies */ + virtual void setTopologyCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets the build quality */ + void setBuildQuality(RTCBuildQuality quality_in) + { + this->quality = quality_in; + Geometry::update(); + } + + /* calculate time segment itime and fractional time ftime */ + __forceinline int timeSegment(float time, float& ftime) const { + return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime); + } + + template<int N> + __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const { + return getTimeSegment(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,fnumTimeSegments); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<(int)numTimeSteps); + return time_range.lower + time_range.size()*float(i)/fnumTimeSegments; + } + + /*! for all geometries */ + public: + + /*! Enable geometry. */ + virtual void enable(); + + /*! Update geometry. */ + void update(); + + /*! commit of geometry */ + virtual void commit(); + + /*! Update geometry buffer. */ + virtual void updateBuffer(RTCBufferType type, unsigned int slot) { + update(); // update everything for geometries not supporting this call + } + + /*! Disable geometry. */ + virtual void disable(); + + /*! Verify the geometry */ + virtual bool verify() { return true; } + + /*! called before every build */ + virtual void preCommit(); + + /*! called after every build */ + virtual void postCommit(); + + virtual void addElementsToCount (GeometryCounts & counts) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + }; + + /*! sets constant tessellation rate for the geometry */ + virtual void setTessellationRate(float N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets the maximal curve radius scale allowed by min-width feature. */ + virtual void setMaxRadiusScale(float s) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set user data pointer. */ + virtual void setUserData(void* ptr); + + /*! Get user data pointer. */ + __forceinline void* getUserData() const { + return userPtr; + } + + /*! interpolates user data to the specified u/v location */ + virtual void interpolate(const RTCInterpolateArguments* const args) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! interpolates user data to the specified u/v locations */ + virtual void interpolateN(const RTCInterpolateNArguments* const args); + + /* point query api */ + bool pointQuery(PointQuery* query, PointQueryContext* context); + + /*! for subdivision surfaces only */ + public: + virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set displacement function. */ + virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFirstHalfEdge(unsigned int faceID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFace(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getNextHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! get fast access to first vertex buffer if applicable */ + virtual float * getCompactVertexArray () const { + return nullptr; + } + + /*! Returns the modified counter - how many times the geo has been modified */ + __forceinline unsigned int getModCounter () const { + return modCounter_; + } + + /*! for triangle meshes and bezier curves only */ + public: + + + /*! Sets ray mask. */ + virtual void setMask(unsigned mask) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets specified buffer. */ + virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Gets specified buffer. */ + virtual void* getBuffer(RTCBufferType type, unsigned int slot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersection filter function for ray packets of size N. */ + virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! Set occlusion filter function for ray packets of size N. */ + virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! for instances only */ + public: + + /*! Sets the instanced scene */ + virtual void setInstancedScene(const Ref<Scene>& scene) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Returns the transformation of the instance */ + virtual AffineSpace3fa getTransform(float time) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! for user geometries only */ + public: + + /*! Set bounds function. */ + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersect function for ray packets of size N. */ + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set occlusion function for ray packets of size N. */ + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set point query function. */ + void setPointQueryFunction(RTCPointQueryFunction func); + + /*! returns number of time segments */ + __forceinline unsigned numTimeSegments () const { + return numTimeSteps-1; + } + + public: + + virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); + } + + virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID, size_t time) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual BBox3fa vbounds(size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + public: + __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; } + __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; } + + public: + Device* device; //!< device this geometry belongs to + + void* userPtr; //!< user pointer + unsigned int numPrimitives; //!< number of primitives of this geometry + + unsigned int numTimeSteps; //!< number of time steps + float fnumTimeSegments; //!< number of time segments (precalculation) + BBox1f time_range; //!< motion blur time range + + unsigned int mask; //!< for masking out geometry + unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified + + struct { + GType gtype : 8; //!< geometry type + GSubType gsubtype : 8; //!< geometry subtype + RTCBuildQuality quality : 3; //!< build quality for geometry + unsigned state : 2; + bool enabled : 1; //!< true if geometry is enabled + }; + + RTCFilterFunctionN intersectionFilterN; + RTCFilterFunctionN occlusionFilterN; + RTCPointQueryFunction pointQueryFunc; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h new file mode 100644 index 0000000000..32a198cdfe --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/hit.h @@ -0,0 +1,114 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "instance_stack.h" + +namespace embree +{ + /* Hit structure for K hits */ + template<int K> + struct HitK + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng) + : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + instance_id_stack::copy(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return K; } + + public: + Vec3vf<K> Ng; // geometry normal + vfloat<K> u; // barycentric u coordinate of hit + vfloat<K> v; // barycentric v coordinate of hit + vuint<K> primID; // primitive ID + vuint<K> geomID; // geometry ID + vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single hit */ + template<> + struct __aligned(16) HitK<1> + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng) + : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID) + { + instance_id_stack::copy(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return 1; } + + public: + Vec3<float> Ng; // geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Shortcuts */ + typedef HitK<1> Hit; + typedef HitK<4> Hit4; + typedef HitK<8> Hit8; + typedef HitK<16> Hit16; + + /* Outputs hit to stream */ + template<int K> + __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& ray) + { + cout << "{ " << embree_endl + << " Ng = " << ray.Ng << embree_endl + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + template<typename Hit> + __forceinline void copyHitToRay(RayHit& ray, const Hit& hit) + { + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = hit.primID; + ray.geomID = hit.geomID; + instance_id_stack::copy(hit.instID, ray.instID); + } + + template<int K> + __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit) + { + vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x); + vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y); + vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z); + vfloat<K>::storeu(mask,&ray.u, hit.u); + vfloat<K>::storeu(mask,&ray.v, hit.v); + vuint<K>::storeu(mask,&ray.primID, hit.primID); + vuint<K>::storeu(mask,&ray.geomID, hit.geomID); + instance_id_stack::copy(hit.instID, ray.instID, mask); + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h new file mode 100644 index 0000000000..d7e3637f7b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/instance_stack.h @@ -0,0 +1,199 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore.h" + +namespace embree { +namespace instance_id_stack { + +static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, + "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0."); + +/******************************************************************************* + * Instance ID stack manipulation. + * This is used from the instance intersector. + ******************************************************************************/ + +/* + * Push an instance to the stack. + */ +RTC_FORCEINLINE bool push(RTCIntersectContext* context, + unsigned instanceId) +{ +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT; + /* We assert here because instances are silently dropped when the stack is full. + This might be quite hard to find in production. */ + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[context->instStackSize++] = instanceId; + return spaceAvailable; +#else + const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID); + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[0] = instanceId; + return spaceAvailable; +#endif +} + + +/* + * Pop the last instance pushed to the stack. + * Do not call on an empty stack. + */ +RTC_FORCEINLINE void pop(RTCIntersectContext* context) +{ + assert(context); +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + assert(context->instStackSize > 0); + context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID; +#else + assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +#endif +} + +/******************************************************************************* + * Optimized instance id stack copy. + * The copy() function at the bottom of this block will either copy full + * stacks or copy only until the last valid element has been copied, depending + * on RTC_MAX_INSTANCE_LEVEL_COUNT. + ******************************************************************************/ + +/* + * Plain array assignment. This works for scalar->scalar, + * scalar->vector, and vector->vector. + */ +template <class Src, class Tgt> +RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt) +{ + tgt[level] = src[level]; +} + +/* + * Masked SIMD vector->vector store. + */ +template <int K> +RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask) +{ + vuint<K>::storeu(mask, tgt + level, src[level]); +} + +/* + * Masked scalar->SIMD vector store. + */ +template <int K> +RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const vbool<K>& mask) +{ + vuint<K>::store(mask, tgt + level, src[level]); +} + +/* + * Indexed assign from vector to scalar. + */ +template <int K> +RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, unsigned* tgt, const size_t& idx) +{ + tgt[level] = src[level][idx]; +} + +/* + * Indexed assign from scalar to vector. + */ +template <int K> +RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const size_t& idx) +{ + tgt[level][idx] = src[level]; +} + +/* + * Indexed assign from vector to vector. + */ +template <int K> +RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const size_t& i, const size_t& j) +{ + tgt[level][j] = src[level][i]; +} + +/* + * Check if the given stack level is valid. + * These are only used for large max stack sizes. + */ +RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack) +{ + return stack[level] != RTC_INVALID_GEOMETRY_ID; +} +RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/) +{ + return stack[level] != RTC_INVALID_GEOMETRY_ID; +} +template <int K> +RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool<K>& /*mask*/) +{ + return stack[level] != RTC_INVALID_GEOMETRY_ID; +} + +template <int K> +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack) +{ + return any(stack[level] != RTC_INVALID_GEOMETRY_ID); +} +template <int K> +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const vbool<K>& mask) +{ + return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID)); +} + +template <int K> +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i) +{ + return stack[level][i] != RTC_INVALID_GEOMETRY_ID; +} +template <int K> +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i, const size_t& /*j*/) +{ + return stack[level][i] != RTC_INVALID_GEOMETRY_ID; +} + +/* + * Copy an instance ID stack. + * + * This function automatically selects a LevelFunctor from the above Assign + * structs. + */ +template <class Src, class Tgt, class... Args> +RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + /* + * Avoid all loops for only one level. + */ + level_copy(0, src, tgt, std::forward<Args>(args)...); + +#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4) + /* + * It is faster to avoid the valid test for low level counts. + * Just copy the whole stack. + */ + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + level_copy(l, src, tgt, std::forward<Args>(args)...); + +#else + /* + * For general stack sizes, it pays off to test for validity. + */ + bool valid = true; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l) + { + level_copy(l, src, tgt, std::forward<Args>(args)...); + valid = level_valid(l, src, std::forward<Args>(args)...); + } +#endif +} + +} // namespace instance_id_stack +} // namespace embree + diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h new file mode 100644 index 0000000000..63fb8d3351 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/isa.h @@ -0,0 +1,271 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" + +namespace embree +{ +#define DEFINE_SYMBOL2(type,name) \ + typedef type (*name##Func)(); \ + name##Func name; + +#define DECLARE_SYMBOL2(type,name) \ + namespace sse2 { extern type name(); } \ + namespace sse42 { extern type name(); } \ + namespace avx { extern type name(); } \ + namespace avx2 { extern type name(); } \ + namespace avx512knl { extern type name(); } \ + namespace avx512skx { extern type name(); } \ + void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \ + type name##_error() { return type(name##_error2); } \ + type name##_zero() { return type(nullptr); } + +#define DECLARE_ISA_FUNCTION(type,symbol,args) \ + namespace sse2 { extern type symbol(args); } \ + namespace sse42 { extern type symbol(args); } \ + namespace avx { extern type symbol(args); } \ + namespace avx2 { extern type symbol(args); } \ + namespace avx512knl { extern type symbol(args); } \ + namespace avx512skx { extern type symbol(args); } \ + inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \ + typedef type (*symbol##Ty)(args); \ + +#define DEFINE_ISA_FUNCTION(type,symbol,args) \ + typedef type (*symbol##Func)(args); \ + symbol##Func symbol; + +#define ZERO_SYMBOL(features,intersector) \ + intersector = intersector##_zero; + +#define INIT_SYMBOL(features,intersector) \ + intersector = decltype(intersector)(intersector##_error); + +#define SELECT_SYMBOL_DEFAULT(features,intersector) \ + intersector = isa::intersector; + +#if defined(__SSE__) || defined(__ARM_NEON) +#if !defined(EMBREE_TARGET_SIMD4) +#define EMBREE_TARGET_SIMD4 +#endif +#endif + +#if defined(EMBREE_TARGET_SSE42) +#define SELECT_SYMBOL_SSE42(features,intersector) \ + if ((features & SSE42) == SSE42) intersector = sse42::intersector; +#else +#define SELECT_SYMBOL_SSE42(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX) || defined(__AVX__) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & ISA) == ISA) intersector = isa::intersector; +#else +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & AVX) == AVX) intersector = avx::intersector; +#endif +#else +#define SELECT_SYMBOL_AVX(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX2) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#define SELECT_SYMBOL_AVX2(features,intersector) \ + if ((features & AVX2) == AVX2) intersector = avx2::intersector; +#else +#define SELECT_SYMBOL_AVX2(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX512KNL) +#if !defined(EMBREE_TARGET_SIMD16) +#define EMBREE_TARGET_SIMD16 +#endif +#define SELECT_SYMBOL_AVX512KNL(features,intersector) \ + if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector; +#else +#define SELECT_SYMBOL_AVX512KNL(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX512SKX) +#if !defined(EMBREE_TARGET_SIMD16) +#define EMBREE_TARGET_SIMD16 +#endif +#define SELECT_SYMBOL_AVX512SKX(features,intersector) \ + if ((features & AVX512SKX) == AVX512SKX) intersector = avx512skx::intersector; +#else +#define SELECT_SYMBOL_AVX512SKX(features,intersector) +#endif + +#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + ZERO_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + + struct VerifyMultiTargetLinking { + static __noinline int getISA(int depth = 5) { + if (depth == 0) return ISA; + else return getISA(depth-1); + } + }; + namespace sse2 { int getISA(); }; + namespace sse42 { int getISA(); }; + namespace avx { int getISA(); }; + namespace avx2 { int getISA(); }; + namespace avx512knl { int getISA(); }; + namespace avx512skx { int getISA(); }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h new file mode 100644 index 0000000000..82953f0e89 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h @@ -0,0 +1,325 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/math/affinespace.h" +#include "../../common/math/interval.h" + +#include <functional> + +namespace embree { + +#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f + +static void motion_derivative_coefficients(const float *p, float *coeff); + +struct MotionDerivativeCoefficients +{ + float theta; + float coeffs[3*8*7]; + + MotionDerivativeCoefficients() {} + + // xfm0 and xfm1 are interpret as quaternion decomposition + MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1) + { + // cosTheta of the two quaternions + const float cosTheta = min(1.f, max(-1.f, + xfm0.l.vx.w * xfm1.l.vx.w + + xfm0.l.vy.w * xfm1.l.vy.w + + xfm0.l.vz.w * xfm1.l.vz.w + + xfm0.p.w * xfm1.p.w)); + + theta = std::acos(cosTheta); + Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w); + if (cosTheta < 0.995f) { + // compute perpendicular quaternion + qperp.x = xfm1.p.w - cosTheta * xfm0.p.w; + qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w; + qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w; + qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w; + qperp = normalize(qperp); + } + const float p[33] = { + theta, + xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0 + xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1 + xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0 + qperp.x, qperp.y, qperp.z, qperp.w, + xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0 + xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y, + xfm0.l.vz.z, xfm0.p.z, + xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1 + xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y, + xfm1.l.vz.z, xfm1.p.z + }; + motion_derivative_coefficients(p, coeffs); + } +}; + +struct MotionDerivative +{ + float twoTheta; + float c[8]; + + MotionDerivative(MotionDerivativeCoefficients const& mdc, + int dim, Vec3fa const& p0, Vec3fa const& p1) + : twoTheta(2.f*mdc.theta) + { + const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z }; + for (int i = 0; i < 8; ++i) { + c[i] = 0; + for (int j = 0; j < 7; ++j) { + c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j]; + } + } + } + + template<typename T> + struct EvalMotionDerivative + { + MotionDerivative const& md; + float offset; + + EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {} + + T operator()(T const& time) const { + return md.c[0] + md.c[1] * time + + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time) + + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time) + + offset; + } + }; + + unsigned int findRoots( + Interval1f const& interval, + float offset, + float* roots, + unsigned int maxNumRoots) + { + unsigned int numRoots = 0; + EvalMotionDerivative<Interval1f> eval(*this, offset); + findRoots(eval, interval, numRoots, roots, maxNumRoots); + return numRoots; + } + + template<typename Eval> + static void findRoots( + + Eval const& eval, + Interval1f const& interval, + unsigned int& numRoots, + float* roots, + unsigned int maxNumRoots) + { + Interval1f range = eval(interval); + if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return; + + const float split = 0.5f * (interval.upper + interval.lower); + if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f || abs(split-interval.upper) < 1e-7f) + { + // check if the root already exists + for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) { + if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON) + return; + } + if (numRoots < maxNumRoots) { + roots[numRoots++] = split; + } + if (numRoots > maxNumRoots) { + printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS + return; + } + return; + } + + findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots); + findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots); + } +}; + +/****************************************************************************** + * Code generated with sympy 1.4 * + * See http://www.sympy.org/ for more information. * + * * + * see * + * * + * scripts/generate_motion_derivative_coefficients.py * + * * + * for how this code is generated * + * * + ******************************************************************************/ +static void motion_derivative_coefficients(const float *p, float *coeff) +{ + coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27]; + coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24]; + coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25]; + coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26]; + coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15]; + coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16]; + coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17]; + coeff[7] = 0; + coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24]; + coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25]; + coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26]; + coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24]; + coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25]; + coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26]; + coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27]; + coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24]; + coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25]; + coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26]; + coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15]; + coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16]; + coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17]; + coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0]; + coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24]; + coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25]; + coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26]; + coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24]; + coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25]; + coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26]; + coeff[28] = 0; + coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0]; + coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0]; + coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0]; + coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0]; + coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0]; + coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0]; + coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27]; + coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24]; + coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25]; + coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26]; + coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15]; + coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16]; + coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17]; + coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0]; + coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24]; + coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25]; + coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26]; + coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24]; + coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25]; + coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26]; + coeff[49] = 0; + coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0]; + coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0]; + coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0]; + coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0]; + coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0]; + coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0]; + coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30]; + coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24]; + coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28]; + coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29]; + coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15]; + coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19]; + coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20]; + coeff[63] = 0; + coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28]; + coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29]; + coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28]; + coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29]; + coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30]; + coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24]; + coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28]; + coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29]; + coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15]; + coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19]; + coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20]; + coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0]; + coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28]; + coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29]; + coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28]; + coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29]; + coeff[84] = 0; + coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0]; + coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0]; + coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0]; + coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0]; + coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0]; + coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0]; + coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30]; + coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24]; + coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28]; + coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29]; + coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15]; + coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19]; + coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20]; + coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0]; + coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24]; + coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28]; + coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29]; + coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24]; + coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28]; + coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29]; + coeff[105] = 0; + coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0]; + coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0]; + coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0]; + coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0]; + coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0]; + coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0]; + coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32]; + coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24]; + coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28]; + coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31]; + coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15]; + coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19]; + coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22]; + coeff[119] = 0; + coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31]; + coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31]; + coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30]; + coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24]; + coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28]; + coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29]; + coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15]; + coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19]; + coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20]; + coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0]; + coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29]; + coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29]; + coeff[140] = 0; + coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0]; + coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0]; + coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0]; + coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0]; + coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0]; + coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0]; + coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32]; + coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24]; + coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28]; + coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31]; + coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15]; + coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19]; + coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22]; + coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0]; + coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24]; + coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28]; + coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31]; + coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24]; + coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28]; + coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31]; + coeff[161] = 0; + coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0]; + coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0]; + coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0]; + coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0]; + coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0]; + coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0]; +} + +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h new file mode 100644 index 0000000000..27d158ca3a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/point_query.h @@ -0,0 +1,136 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* Point query structure for closest point query */ + template<int K> + struct RTC_ALIGN(16) PointQueryK + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero) + : p(p), time(time), radius(radius) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool<K> valid() const + { + const vbool<K> vx = (abs(p.x) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vy = (abs(p.y) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vz = (abs(p.z) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vn = radius >= vfloat<K>(0); + const vbool<K> vf = abs(time) < vfloat<K>(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(PointQueryK<1>* ray) const; + __forceinline void get(size_t i, PointQueryK<1>& ray) const; + __forceinline void set(const PointQueryK<1>* ray); + __forceinline void set(size_t i, const PointQueryK<1>& ray); + + Vec3vf<K> p; // location of the query point + vfloat<K> time; // time for motion blur + vfloat<K> radius; // radius for the point query + }; + + /* Specialization for a single point query */ + template<> + struct RTC_ALIGN(16) PointQueryK<1> + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero) + : p(p), time(time), radius(radius) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf); + } + + Vec3f p; + float time; + float radius; + }; + + /* Converts point query packet to single point query */ + template<int K> + __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + query[i].p.x = p.x[i]; + query[i].p.y = p.y[i]; + query[i].p.z = p.z[i]; + query[i].time = time[i]; + query[i].radius = radius[i]; + } + } + + /* Extracts a single point query out of a point query packet*/ + template<int K> + __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const + { + query.p.x = p.x[i]; + query.p.y = p.y[i]; + query.p.z = p.z[i]; + query.radius = radius[i]; + query.time = time[i]; + } + + /* Converts single point query to point query packet */ + template<int K> + __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query) + { + for (size_t i = 0; i < K; i++) + { + p.x[i] = query[i].p.x; + p.y[i] = query[i].p.y; + p.z[i] = query[i].p.z; + radius[i] = query[i].radius; + time[i] = query[i].time; + } + } + + /* inserts a single point query into a point query packet element */ + template<int K> + __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query) + { + p.x[i] = query.p.x; + p.y[i] = query.p.y; + p.z[i] = query.p.z; + radius[i] = query.radius; + time[i] = query.time; + } + + /* Shortcuts */ + typedef PointQueryK<1> PointQuery; + typedef PointQueryK<4> PointQuery4; + typedef PointQueryK<8> PointQuery8; + typedef PointQueryK<16> PointQuery16; + struct PointQueryN; + + /* Outputs point query to stream */ + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query) + { + cout << "{ " << embree_endl + << " p = " << query.p << embree_endl + << " r = " << query.radius << embree_endl + << " time = " << query.time << embree_endl + << "}"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h new file mode 100644 index 0000000000..ce75c982bb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/primref.h @@ -0,0 +1,138 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(32) PrimRef + { + __forceinline PrimRef () {} + +#if defined(__AVX__) + __forceinline PrimRef(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); + } + __forceinline PrimRef& operator=(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this; + } +#endif + + __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) + { + lower = Vec3fx(bounds.lower, geomID); + upper = Vec3fx(bounds.upper, primID); + } + + __forceinline PrimRef (const BBox3fa& bounds, size_t id) + { +#if defined(__X86_64__) || defined(__aarch64__) + lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF)); + upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF)); +#else + lower = Vec3fx(bounds.lower, (unsigned)id); + upper = Vec3fx(bounds.upper, (unsigned)0); +#endif + } + + /*! calculates twice the center of the primitive */ + __forceinline const Vec3fa center2() const { + return lower+upper; + } + + /*! return the bounding box of the primitive */ + __forceinline const BBox3fa bounds() const { + return BBox3fa(lower,upper); + } + + /*! size for bin heuristic is 1 */ + __forceinline unsigned size() const { + return 1; + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = embree::center2(bounds_o); + } + + __forceinline unsigned& geomIDref() { // FIXME: remove !!!!!!! + return lower.u; + } + __forceinline unsigned& primIDref() { // FIXME: remove !!!!!!! + return upper.u; + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__X86_64__) || defined(__aarch64__) + return size_t(lower.u) + (size_t(upper.u) << 32); +#else + return size_t(lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }"; + } + + public: + Vec3fx lower; //!< lower bounds and geomID + Vec3fx upper; //!< upper bounds and primID + }; + + /*! fast exchange for PrimRefs */ + __forceinline void xchg(PrimRef& a, PrimRef& b) + { +#if defined(__AVX__) + const vfloat8 aa = vfloat8::load((float*)&a); + const vfloat8 bb = vfloat8::load((float*)&b); + vfloat8::store((float*)&a,bb); + vfloat8::store((float*)&b,aa); +#else + std::swap(a,b); +#endif + } + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct SubGridBuildData { + unsigned short sx,sy; + unsigned int primID; + + __forceinline SubGridBuildData() {}; + __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {}; + + __forceinline size_t x() const { return (size_t)sx & 0x7fff; } + __forceinline size_t y() const { return (size_t)sy & 0x7fff; } + + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h new file mode 100644 index 0000000000..b6c1ad5712 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/primref_mb.h @@ -0,0 +1,262 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +#define MBLUR_BIN_LBBOX 1 + +namespace embree +{ +#if MBLUR_BIN_LBBOX + + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct PrimRefMB + { + typedef LBBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); + lbounds.bounds0.lower.a = geomID; + lbounds.bounds0.upper.a = primID; + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__X86_64__) || defined(__aarch64__) + lbounds.bounds0.lower.a = id & 0xFFFFFFFF; + lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.a = id; + lbounds.bounds0.upper.a = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__X86_64__) || defined(__aarch64__) + lbounds.bounds0.lower.u = id & 0xFFFFFFFF; + lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.u = id; + lbounds.bounds0.upper.u = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + /*! returns bounds for binning */ + __forceinline LBBox3fa bounds() const { + return lbounds; + } + + /*! returns the number of time segments of this primref */ + __forceinline unsigned size() const { + return lbounds.bounds1.lower.a; + } + + __forceinline unsigned totalTimeSegments() const { + return lbounds.bounds1.upper.a; + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(totalTimeSegments())); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)totalTimeSegments()); + return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments()); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = binCenter(); + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lbounds.bounds0.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return lbounds.bounds0.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__X86_64__) || defined(__aarch64__) + return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32); +#else + return size_t(lbounds.bounds0.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + LBBox3fx lbounds; + BBox1f time_range; // entire geometry time range + }; + +#else + + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(16) PrimRefMB + { + typedef BBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); + bbox.lower.a = geomID; + bbox.upper.a = primID; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__X86_64__) || defined(__aarch64__) + bbox.lower.u = id & 0xFFFFFFFF; + bbox.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + bbox.lower.u = id; + bbox.upper.u = 0; +#endif + } + + /*! returns bounds for binning */ + __forceinline BBox3fa bounds() const { + return bbox; + } + + /*! returns the number of time segments of this primref */ + __forceinline unsigned int size() const { + return _activeTimeSegments; + } + + __forceinline unsigned int totalTimeSegments() const { + return _totalTimeSegments; + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(_totalTimeSegments)); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)_totalTimeSegments); + return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(bounds()); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = center2(bounds()); + } + + /*! returns the geometry ID */ + __forceinline unsigned int geomID() const { + return bbox.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned int primID() const { + return bbox.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__X86_64__) || defined(__aarch64__) + return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); +#else + return size_t(bbox.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + BBox3fa bbox; // bounds, geomID, primID + unsigned int _activeTimeSegments; + unsigned int _totalTimeSegments; + BBox1f time_range; // entire geometry time range + }; + +#endif +} diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h new file mode 100644 index 0000000000..a7de36414d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/profile.h @@ -0,0 +1,159 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! helper structure for the implementation of the profile functions below */ + struct ProfileTimer + { + static const size_t N = 20; + + ProfileTimer () {} + + ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0) + { + for (size_t i=0; i<N; i++) names[i] = nullptr; + for (size_t i=0; i<N; i++) dt_fst[i] = 0.0; + for (size_t i=0; i<N; i++) dt_min[i] = pos_inf; + for (size_t i=0; i<N; i++) dt_avg[i] = 0.0; + for (size_t i=0; i<N; i++) dt_max[i] = neg_inf; + } + + __forceinline void begin() + { + j=0; + t0 = tj = getSeconds(); + } + + __forceinline void end() { + absolute("total"); + i++; + } + + __forceinline void operator() (const char* name) { + relative(name); + } + + __forceinline void absolute (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-t0; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + __forceinline void relative (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-tj; + tj = t1; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + void print(size_t numElements) + { + for (size_t k=0; k<N; k++) + dt_avg[k] /= double(i-numSkip); + + printf(" profile [M/s]:\n"); + for (size_t j=0; j<maxJ; j++) + printf("%20s: fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n", + names[j],numElements/dt_fst[j]*1E-6,numElements/dt_max[j]*1E-6,numElements/dt_avg[j]*1E-6,numElements/dt_min[j]*1E-6); + + printf(" profile [ms]:\n"); + for (size_t j=0; j<maxJ; j++) + printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n", + names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]); + } + + void print() + { + printf(" profile:\n"); + + for (size_t k=0; k<N; k++) + dt_avg[k] /= double(i-numSkip); + + for (size_t j=0; j<maxJ; j++) { + printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n", + names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]); + } + } + + double avg() { + return dt_avg[maxJ-1]/double(i-numSkip); + } + + private: + size_t i; + size_t j; + size_t maxJ; + size_t numSkip; + double t0; + double tj; + const char* names[N]; + double dt_fst[N]; + double dt_min[N]; + double dt_avg[N]; + double dt_max[N]; + }; + + /*! This function executes some code block multiple times and measured sections of it. + Use the following way: + + profile(1,10,1000,[&](ProfileTimer& timer) { + // code + timer("A"); + // code + timer("B"); + }); + */ + template<typename Closure> + void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + ProfileTimer timer(numSkip); + + for (size_t i=0; i<numSkip+numIter; i++) + { + timer.begin(); + closure(timer); + timer.end(); + } + timer.print(numElements); + } + + /*! similar as the function above, but the timer object comes externally */ + template<typename Closure> + void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + timer = ProfileTimer(numSkip); + + for (size_t i=0; i<numSkip+numIter; i++) + { + timer.begin(); + closure(timer); + timer.end(); + } + timer.print(numElements); + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/ray.h b/thirdparty/embree-aarch64/kernels/common/ray.h new file mode 100644 index 0000000000..336d48942c --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/ray.h @@ -0,0 +1,1517 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "instance_stack.h" + +// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted + +namespace embree +{ + static const size_t MAX_INTERNAL_STREAM_SIZE = 32; + + /* Ray structure for K rays */ + template<int K> + struct RayK + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir, + const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf, + const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0) + : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool<K> valid() const + { + const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf); + const vbool<K> vf = abs(tfar) <= vfloat<K>(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(RayK<1>* ray) const; + __forceinline void get(size_t i, RayK<1>& ray) const; + __forceinline void set(const RayK<1>* ray); + __forceinline void set(size_t i, const RayK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + __forceinline vint<K> octant() const + { + return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) | + select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) | + select(dir.z < 0.0f, vint<K>(4), vint<K>(zero)); + } + + /* Ray data */ + Vec3vf<K> org; // ray origin + vfloat<K> _tnear; // start of ray segment + Vec3vf<K> dir; // ray direction + vfloat<K> _time; // time of this ray for motion blur + vfloat<K> tfar; // end of ray segment + vint<K> mask; // used to mask out objects during traversal + vint<K> id; + vint<K> flags; + + __forceinline vfloat<K>& tnear() { return _tnear; } + __forceinline vfloat<K>& time() { return _time; } + __forceinline const vfloat<K>& tnear() const { return _tnear; } + __forceinline const vfloat<K>& time() const { return _time; } + }; + + /* Ray+hit structure for K rays */ + template<int K> + struct RayHitK : RayK<K> + { + using RayK<K>::org; + using RayK<K>::_tnear; + using RayK<K>::dir; + using RayK<K>::_time; + using RayK<K>::tfar; + using RayK<K>::mask; + using RayK<K>::id; + using RayK<K>::flags; + + using RayK<K>::tnear; + using RayK<K>::time; + + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir, + const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf, + const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0) + : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK(const RayK<K>& ray) + : RayK<K>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK<K>& operator =(const RayK<K>& ray) + { + org = ray.org; + _tnear = ray._tnear; + dir = ray.dir; + _time = ray._time; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit(const vbool<K>& valid0) const + { + vbool<K> valid = valid0 & geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID); + const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf)); + const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vv = (abs(u) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE); + const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE); + const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE); + if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t"); + if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u"); + if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v"); + if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x"); + if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y"); + if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z"); + } + + __forceinline void get(RayHitK<1>* ray) const; + __forceinline void get(size_t i, RayHitK<1>& ray) const; + __forceinline void set(const RayHitK<1>* ray); + __forceinline void set(size_t i, const RayHitK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + /* Hit data */ + Vec3vf<K> Ng; // geometry normal + vfloat<K> u; // barycentric u coordinate of hit + vfloat<K> v; // barycentric v coordinate of hit + vuint<K> primID; // primitive ID + vuint<K> geomID; // geometry ID + vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single ray */ + template<> + struct RayK<1> + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf); + } + + /* Ray data */ + Vec3ff org; // 3 floats for ray origin, 1 float for tnear + //float tnear; // start of ray segment + Vec3ff dir; // 3 floats for ray direction, 1 float for time + // float time; + float tfar; // end of ray segment + int mask; // used to mask out objects during traversal + int id; // ray ID + int flags; // ray flags + + __forceinline float& tnear() { return org.w; }; + __forceinline const float& tnear() const { return org.w; }; + + __forceinline float& time() { return dir.w; }; + __forceinline const float& time() const { return dir.w; }; + + }; + + template<> + struct RayHitK<1> : RayK<1> + { + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK(const RayK<1>& ray) + : RayK<1>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK<1>& operator =(const RayK<1>& ray) + { + org = ray.org; + dir = ray.dir; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit() const + { + if (geomID == RTC_INVALID_GEOMETRY_ID) return; + const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf)); + const bool vu = (abs(u) <= FLT_LARGE); + const bool vv = (abs(u) <= FLT_LARGE); + const bool vnx = abs(Ng.x) <= FLT_LARGE; + const bool vny = abs(Ng.y) <= FLT_LARGE; + const bool vnz = abs(Ng.z) <= FLT_LARGE; + if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t"); + if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u"); + if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v"); + if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x"); + if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y"); + if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z"); + } + + /* Hit data */ + Vec3f Ng; // not normalized geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Converts ray packet to single rays */ + template<int K> + __forceinline void RayK<K>::get(RayK<1>* ray) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = tnear()[i]; + ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time() = time()[i]; + ray[i].tfar = tfar[i]; ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i]; + } + } + + template<int K> + __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + get(i, ray[i]); + } + + /* Extracts a single ray out of a ray packet*/ + template<int K> + __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time() = time()[i]; + ray.tfar = tfar[i]; ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + } + + template<int K> + __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar = tfar[i]; ray.time() = time()[i]; + ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i]; + ray.u = u[i]; ray.v = v[i]; + ray.primID = primID[i]; ray.geomID = geomID[i]; + + instance_id_stack::copy(instID, ray.instID, i); + } + + /* Converts single rays to ray packet */ + template<int K> + __forceinline void RayK<K>::set(const RayK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + template<int K> + __forceinline void RayHitK<K>::set(const RayHitK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + /* inserts a single ray into a ray packet element */ + template<int K> + __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + } + + template<int K> + __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z; + u[i] = ray.u; v[i] = ray.v; + primID[i] = ray.primID; geomID[i] = ray.geomID; + + instance_id_stack::copy(ray.instID, instID, i); + } + + /* copies a ray packet element into another element*/ + template<int K> + __forceinline void RayK<K>::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + } + + template<int K> + __forceinline void RayHitK<K>::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source]; + u[dest] = u[source]; v[dest] = v[source]; + primID[dest] = primID[source]; geomID[dest] = geomID[source]; + + instance_id_stack::copy(instID, instID, source, dest); + } + + /* Shortcuts */ + typedef RayK<1> Ray; + typedef RayK<4> Ray4; + typedef RayK<8> Ray8; + typedef RayK<16> Ray16; + struct RayN; + + typedef RayHitK<1> RayHit; + typedef RayHitK<4> RayHit4; + typedef RayHitK<8> RayHit8; + typedef RayHitK<16> RayHit16; + struct RayHitN; + + template<int K, bool intersect> + struct RayTypeHelper; + + template<int K> + struct RayTypeHelper<K, true> + { + typedef RayHitK<K> Ty; + }; + + template<int K> + struct RayTypeHelper<K, false> + { + typedef RayK<K> Ty; + }; + + template<bool intersect> + using RayType = typename RayTypeHelper<1, intersect>::Ty; + + template<int K, bool intersect> + using RayTypeK = typename RayTypeHelper<K, intersect>::Ty; + + /* Outputs ray to stream */ + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray) + { + return cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << "}"; + } + + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray) + { + cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << " Ng = " << ray.Ng + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + struct RayStreamSOA + { + __forceinline RayStreamSOA(void* rays, size_t N) + : ptr((char*)rays), N(N) {} + + /* ray data access functions */ + __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; } // x coordinate of ray origin + __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; } // y coordinate of ray origin + __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin + __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment + + __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction + __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction + __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction + __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur + + __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance) + __forceinline int* mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset]; }; // used to mask out objects during traversal (optional) + __forceinline int* id (size_t offset = 0) { return (int*)&ptr[10*4*N+offset]; }; // id + __forceinline int* flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset]; }; // flags + + /* hit data access functions */ + __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal + __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal + __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal + + __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }; // barycentric u coordinate of hit + __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }; // barycentric v coordinate of hit + + __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; }; // primitive ID + __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }; // geometry ID + __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }; // instance ID + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = org_x(offset)[0]; + ray.org.y = org_y(offset)[0]; + ray.org.z = org_z(offset)[0]; + ray.tnear() = tnear(offset)[0]; + ray.dir.x = dir_x(offset)[0]; + ray.dir.y = dir_y(offset)[0]; + ray.dir.z = dir_z(offset)[0]; + ray.time() = time(offset)[0]; + ray.tfar = tfar(offset)[0]; + ray.mask = mask(offset)[0]; + ray.id = id(offset)[0]; + ray.flags = flags(offset)[0]; + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(org_x(offset)); + ray.org.y = vfloat<K>::loadu(org_y(offset)); + ray.org.z = vfloat<K>::loadu(org_z(offset)); + ray.tnear = vfloat<K>::loadu(tnear(offset)); + ray.dir.x = vfloat<K>::loadu(dir_x(offset)); + ray.dir.y = vfloat<K>::loadu(dir_y(offset)); + ray.dir.z = vfloat<K>::loadu(dir_z(offset)); + ray.time = vfloat<K>::loadu(time(offset)); + ray.tfar = vfloat<K>::loadu(tfar(offset)); + ray.mask = vint<K>::loadu(mask(offset)); + ray.id = vint<K>::loadu(id(offset)); + ray.flags = vint<K>::loadu(flags(offset)); + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(valid, org_x(offset)); + ray.org.y = vfloat<K>::loadu(valid, org_y(offset)); + ray.org.z = vfloat<K>::loadu(valid, org_z(offset)); + ray.tnear() = vfloat<K>::loadu(valid, tnear(offset)); + ray.dir.x = vfloat<K>::loadu(valid, dir_x(offset)); + ray.dir.y = vfloat<K>::loadu(valid, dir_y(offset)); + ray.dir.z = vfloat<K>::loadu(valid, dir_z(offset)); + ray.time() = vfloat<K>::loadu(valid, time(offset)); + ray.tfar = vfloat<K>::loadu(valid, tfar(offset)); + +#if !defined(__AVX__) + /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked loads always access the entire vector */ + if (unlikely(!all(valid))) + { + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + ray.mask[k] = mask(offset)[k]; + ray.id[k] = id(offset)[k]; + ray.flags[k] = flags(offset)[k]; + } + } + } + else +#endif + { + ray.mask = vint<K>::loadu(valid, mask(offset)); + ray.id = vint<K>::loadu(valid, id(offset)); + ray.flags = vint<K>::loadu(valid, flags(offset)); + } + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray) + { + /* + * valid_i: stores which of the input rays exist (do not access nonexistent rays!) + * valid: stores which of the rays actually hit something. + */ + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat<K>::storeu(valid, tfar(offset), ray.tfar); + vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x); + vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y); + vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z); + vfloat<K>::storeu(valid, u(offset), ray.u); + vfloat<K>::storeu(valid, v(offset), ray.v); + +#if !defined(__AVX__) + /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked stores always access the entire vector */ + if (unlikely(!all(valid_i))) + { + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + primID(offset)[k] = ray.primID[k]; + geomID(offset)[k] = ray.geomID[k]; + + instID(0, offset)[k] = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + instID(l, offset)[k] = ray.instID[l][k]; +#endif + } + } + } + else +#endif + { + vuint<K>::storeu(valid, primID(offset), ray.primID); + vuint<K>::storeu(valid, geomID(offset), ray.geomID); + + vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]); +#endif + } + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat<K>::storeu(valid, tfar(offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = dir_x(offset)[0]; + const float dy = dir_y(offset)[0]; + const float dz = dir_z(offset)[0]; + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear(offset)[0]; + const float ffar = tfar(offset)[0]; + return nnear <= ffar; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + RayK<K> ray; + +#if defined(__AVX2__) + ray.org.x = vfloat<K>::template gather<1>(valid, org_x(), offset); + ray.org.y = vfloat<K>::template gather<1>(valid, org_y(), offset); + ray.org.z = vfloat<K>::template gather<1>(valid, org_z(), offset); + ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset); + ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x(), offset); + ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y(), offset); + ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z(), offset); + ray.time() = vfloat<K>::template gather<1>(valid, time(), offset); + ray.tfar = vfloat<K>::template gather<1>(valid, tfar(), offset); + ray.mask = vint<K>::template gather<1>(valid, mask(), offset); + ray.id = vint<K>::template gather<1>(valid, id(), offset); + ray.flags = vint<K>::template gather<1>(valid, flags(), offset); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.time() = zero; + ray.tfar = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *org_x(ofs); + ray.org.y[k] = *org_y(ofs); + ray.org.z[k] = *org_z(ofs); + ray.tnear()[k] = *tnear(ofs); + ray.dir.x[k] = *dir_x(ofs); + ray.dir.y[k] = *dir_y(ofs); + ray.dir.z[k] = *dir_z(ofs); + ray.time()[k] = *time(ofs); + ray.tfar[k] = *tfar(ofs); + ray.mask[k] = *mask(ofs); + ray.id[k] = *id(ofs); + ray.flags[k] = *flags(ofs); + } + } +#endif + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); + vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x); + vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y); + vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, u(), offset, ray.u); + vfloat<K>::template scatter<1>(valid, v(), offset, ray.v); + vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID); + vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID); + + vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + + *Ng_x(ofs) = ray.Ng.x[k]; + *Ng_y(ofs) = ray.Ng.y[k]; + *Ng_z(ofs) = ray.Ng.z[k]; + *u(ofs) = ray.u[k]; + *v(ofs) = ray.v[k]; + *primID(ofs) = ray.primID[k]; + *geomID(ofs) = ray.geomID[k]; + + *instID(0, ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *instID(l, ofs) = ray.instID[l][k]; +#endif + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + } +#endif + } + } + + char* __restrict__ ptr; + size_t N; + }; + + template<size_t MAX_K> + struct StackRayStreamSOA : public RayStreamSOA + { + __forceinline StackRayStreamSOA(size_t K) + : RayStreamSOA(data, K) { assert(K <= MAX_K); } + + char data[MAX_K / 4 * sizeof(RayHit4)]; + }; + + + struct RayStreamSOP + { + template<class T> + __forceinline void init(T& t) + { + org_x = (float*)&t.org.x; + org_y = (float*)&t.org.y; + org_z = (float*)&t.org.z; + tnear = (float*)&t.tnear; + dir_x = (float*)&t.dir.x; + dir_y = (float*)&t.dir.y; + dir_z = (float*)&t.dir.z; + time = (float*)&t.time; + tfar = (float*)&t.tfar; + mask = (unsigned int*)&t.mask; + id = (unsigned int*)&t.id; + flags = (unsigned int*)&t.flags; + + Ng_x = (float*)&t.Ng.x; + Ng_y = (float*)&t.Ng.y; + Ng_z = (float*)&t.Ng.z; + u = (float*)&t.u; + v = (float*)&t.v; + primID = (unsigned int*)&t.primID; + geomID = (unsigned int*)&t.geomID; + + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = (unsigned int*)&t.instID[l]; + } + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = *(float* __restrict__)((char*)org_x + offset); + ray.org.y = *(float* __restrict__)((char*)org_y + offset); + ray.org.z = *(float* __restrict__)((char*)org_z + offset); + ray.dir.x = *(float* __restrict__)((char*)dir_x + offset); + ray.dir.y = *(float* __restrict__)((char*)dir_y + offset); + ray.dir.z = *(float* __restrict__)((char*)dir_z + offset); + ray.tfar = *(float* __restrict__)((char*)tfar + offset); + ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + ray.time() = time ? *(float* __restrict__)((char*)time + offset) : 0.0f; + ray.mask = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1; + ray.id = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1; + ray.flags = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1; + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset)); + ray.org.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset)); + ray.org.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset)); + ray.dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + ray.dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + ray.dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + ray.tfar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + ray.time() = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f; + ray.mask = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1; + ray.id = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1; + ray.flags = flags ? vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1; + return ray; + } + + template<int K> + __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset) + { + Vec3vf<K> dir; + dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + return dir; + } + + __forceinline void setHitByOffset(size_t offset, const RayHit& ray) + { + if (ray.geomID != RTC_INVALID_GEOMETRY_ID) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z; + *(float* __restrict__)((char*)u + offset) = ray.u; + *(float* __restrict__)((char*)v + offset) = ray.v; + *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID; + *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l]; +#endif + } + } + } + + __forceinline void setHitByOffset(size_t offset, const Ray& ray) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + + if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x); + if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y); + if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z); + vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u); + vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v); + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID); + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID); + + if (likely(instID[0])) { + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]); +#endif + } + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = *(float* __restrict__)((char*)dir_x + offset); + const float dy = *(float* __restrict__)((char*)dir_y + offset); + const float dz = *(float* __restrict__)((char*)dir_z + offset); + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + const float ffar = *(float* __restrict__)((char*)tfar + offset); + return nnear <= ffar; + } + + template<int K> + __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset) + { + const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + const vfloat<K> ffar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + return nnear <= ffar; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + RayK<K> ray; + +#if defined(__AVX2__) + ray.org.x = vfloat<K>::template gather<1>(valid, org_x, offset); + ray.org.y = vfloat<K>::template gather<1>(valid, org_y, offset); + ray.org.z = vfloat<K>::template gather<1>(valid, org_z, offset); + ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x, offset); + ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y, offset); + ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z, offset); + ray.tfar = vfloat<K>::template gather<1>(valid, tfar, offset); + ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero); + ray.time() = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero); + ray.mask = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1); + ray.id = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1); + ray.flags = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.tfar = zero; + ray.time() = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *(float* __restrict__)((char*)org_x + ofs); + ray.org.y[k] = *(float* __restrict__)((char*)org_y + ofs); + ray.org.z[k] = *(float* __restrict__)((char*)org_z + ofs); + ray.dir.x[k] = *(float* __restrict__)((char*)dir_x + ofs); + ray.dir.y[k] = *(float* __restrict__)((char*)dir_y + ofs); + ray.dir.z[k] = *(float* __restrict__)((char*)dir_z + ofs); + ray.tfar[k] = *(float* __restrict__)((char*)tfar + ofs); + ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f; + ray.time()[k] = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f; + ray.mask[k] = mask ? *(int* __restrict__)((char*)mask + ofs) : -1; + ray.id[k] = id ? *(int* __restrict__)((char*)id + ofs) : -1; + ray.flags[k] = flags ? *(int* __restrict__)((char*)flags + ofs) : -1; + } + } +#endif + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar); + + if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x); + if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y); + if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, u, offset, ray.u); + vfloat<K>::template scatter<1>(valid, v, offset, ray.v); + vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID); + vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID); + + if (likely(instID[0])) { + vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]); +#endif + } +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k]; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k]; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k]; + *(float* __restrict__)((char*)u + ofs) = ray.u[k]; + *(float* __restrict__)((char*)v + ofs) = ray.v[k]; + *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k]; + *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k]; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k]; +#endif + } + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + } +#endif + } + } + + /* ray data */ + float* __restrict__ org_x; // x coordinate of ray origin + float* __restrict__ org_y; // y coordinate of ray origin + float* __restrict__ org_z; // z coordinate of ray origin + float* __restrict__ tnear; // start of ray segment (optional) + + float* __restrict__ dir_x; // x coordinate of ray direction + float* __restrict__ dir_y; // y coordinate of ray direction + float* __restrict__ dir_z; // z coordinate of ray direction + float* __restrict__ time; // time of this ray for motion blur (optional) + + float* __restrict__ tfar; // end of ray segment (set to hit distance) + unsigned int* __restrict__ mask; // used to mask out objects during traversal (optional) + unsigned int* __restrict__ id; // ray ID + unsigned int* __restrict__ flags; // ray flags + + /* hit data */ + float* __restrict__ Ng_x; // x coordinate of geometry normal (optional) + float* __restrict__ Ng_y; // y coordinate of geometry normal (optional) + float* __restrict__ Ng_z; // z coordinate of geometry normal (optional) + + float* __restrict__ u; // barycentric u coordinate of hit + float* __restrict__ v; // barycentric v coordinate of hit + + unsigned int* __restrict__ primID; // primitive ID + unsigned int* __restrict__ geomID; // geometry ID + unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional) + }; + + + struct RayStreamAOS + { + __forceinline RayStreamAOS(void* rays) + : ptr((Ray*)rays) {} + + __forceinline Ray& getRayByOffset(size_t offset) + { + return *(Ray*)((char*)ptr + offset); + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vint<K>& offset); + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + const vint<K> valid_offset = select(valid, offset, vintx(zero)); + return getRayByOffset(valid_offset); + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v); + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID); + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID); + + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + + instance_id_stack::copy(ray.instID, ray_k->instID, k); + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + } +#endif + } + } + + Ray* __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir); + const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir); + const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir); + const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org); + const vfloat8 ab8 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org); + const vfloat8 ab9 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org); + const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org); + const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org); + const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org); + const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org); + const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org); + const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar); + const vfloat4 c8 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar); + const vfloat4 c9 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar); + const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar); + const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar); + const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar); + const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar); + const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar); + const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + + + struct RayStreamAOP + { + __forceinline RayStreamAOP(void* rays) + : ptr((Ray**)rays) {} + + __forceinline Ray& getRayByIndex(size_t index) + { + return *ptr[index]; + } + + template<int K> + __forceinline RayK<K> getRayByIndex(const vint<K>& index); + + template<int K> + __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index) + { + const vint<K> valid_index = select(valid, index, vintx(zero)); + return getRayByIndex(valid_index); + } + + template<int K> + __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + instance_id_stack::copy(ray.instID, ray_k->instID, k); + } + } + } + + template<int K> + __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + } + } + } + + Ray** __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org); + const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org); + const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org); + const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir); + const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir); + const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir); + const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + const vfloat8 ab8 = vfloat8::loadu(&ptr[index[8]]->org); + const vfloat8 ab9 = vfloat8::loadu(&ptr[index[9]]->org); + const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org); + const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org); + const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org); + const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org); + const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org); + const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + const vfloat4 c8 = vfloat4::loadu(&ptr[index[8]]->tfar); + const vfloat4 c9 = vfloat4::loadu(&ptr[index[9]]->tfar); + const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar); + const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar); + const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar); + const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar); + const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar); + const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif +} diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp new file mode 100644 index 0000000000..625fbf6d4f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp @@ -0,0 +1,1799 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "../../include/embree3/rtcore_ray.h" + +#if defined(__aarch64__) && defined(BUILD_IOS) +#include <mutex> +#endif + +using namespace embree; + +RTC_NAMESPACE_BEGIN; + + /* mutex to make API thread safe */ +#if defined(__aarch64__) && defined(BUILD_IOS) + static std::mutex g_mutex; +#else + static MutexSys g_mutex; +#endif + + RTC_API RTCDevice rtcNewDevice(const char* config) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewDevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock<MutexSys> lock(g_mutex); +#endif + Device* device = new Device(config); + return (RTCDevice) device->refInc(); + RTC_CATCH_END(nullptr); + return (RTCDevice) nullptr; + } + + RTC_API void rtcRetainDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainDevice); + RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock<MutexSys> lock(g_mutex); +#endif + device->refInc(); + RTC_CATCH_END(nullptr); + } + + RTC_API void rtcReleaseDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseDevice); + RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock<MutexSys> lock(g_mutex); +#endif + device->refDec(); + RTC_CATCH_END(nullptr); + } + + RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceProperty); + RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock<MutexSys> lock(g_mutex); +#endif + return device->getProperty(prop); + RTC_CATCH_END(device); + return 0; + } + + RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceProperty); + const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; + if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock<MutexSys> lock(g_mutex); +#endif + device->setProperty(prop,val); + RTC_CATCH_END(device); + } + + RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceError); + if (device == nullptr) return Device::getThreadErrorCode(); + else return device->getDeviceErrorCode(); + RTC_CATCH_END(device); + return RTC_ERROR_UNKNOWN; + } + + RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceErrorFunction); + RTC_VERIFY_HANDLE(hdevice); + device->setErrorFunction(error, userPtr); + RTC_CATCH_END(device); + } + + RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceMemoryMonitorFunction); + device->setMemoryMonitorFunction(memoryMonitor, userPtr); + RTC_CATCH_END(device); + } + + RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewSharedBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API void* rtcGetBufferData(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetBufferData); + RTC_VERIFY_HANDLE(hbuffer); + return buffer->data(); + RTC_CATCH_END2(buffer); + return nullptr; + } + + RTC_API void rtcRetainBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refInc(); + RTC_CATCH_END2(buffer); + } + + RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refDec(); + RTC_CATCH_END2(buffer); + } + + RTC_API RTCScene rtcNewScene (RTCDevice hdevice) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewScene); + RTC_VERIFY_HANDLE(hdevice); + Scene* scene = new Scene((Device*)hdevice); + return (RTCScene) scene->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneDevice); + RTC_VERIFY_HANDLE(hscene); + return (RTCDevice)scene->device->refInc(); // user will own one additional device reference + RTC_CATCH_END2(scene); + return (RTCDevice)nullptr; + } + + RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneProgressMonitorFunction); + RTC_VERIFY_HANDLE(hscene); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock<MutexSys> lock(g_mutex); +#endif + scene->setProgressMonitorFunction(progress,ptr); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneBuildQuality); + RTC_VERIFY_HANDLE(hscene); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + scene->setSceneFlags(flags); + RTC_CATCH_END2(scene); + } + + RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + return scene->getSceneFlags(); + RTC_CATCH_END2(scene); + return RTC_SCENE_FLAG_NONE; + } + + RTC_API void rtcCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(false); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcJoinCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcJoinCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(true); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + BBox3fa bounds = scene->bounds.bounds(); + bounds_o->lower_x = bounds.lower.x; + bounds_o->lower_y = bounds.lower.y; + bounds_o->lower_z = bounds.lower.z; + bounds_o->align0 = 0; + bounds_o->upper_x = bounds.upper.x; + bounds_o->upper_y = bounds.upper.y; + bounds_o->upper_z = bounds.upper.z; + bounds_o->align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (bounds_o == nullptr) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer"); + if (scene->isModified()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + + bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x; + bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y; + bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z; + bounds_o->bounds0.align0 = 0; + bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x; + bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y; + bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z; + bounds_o->bounds0.align1 = 0; + bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x; + bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y; + bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z; + bounds_o->bounds1.align0 = 0; + bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x; + bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y; + bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z; + bounds_o->bounds1.align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr) + { + Scene* scene0 = (Scene*) hscene0; + Scene* scene1 = (Scene*) hscene1; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCollide); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene0); + RTC_VERIFY_HANDLE(hscene1); + if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices"); + auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + if (scene0->numPrimitives() != nUserPrims0 && scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep"); +#endif + scene0->intersectors.collide(scene0,scene1,callback,userPtr); + RTC_CATCH_END(scene0->device); + } + + inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + bool changed = false; + if (userContext->instStackSize > 0) + { + const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + + float similarityScale = 0.f; + const bool similtude = similarityTransform(transform, &similarityScale); + assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f)); + + PointQuery query_inst; + query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z)); + query_inst.radius = query->radius * similarityScale; + query_inst.time = query->time; + + PointQueryContext context_inst(scene, (PointQuery*)query, + similtude ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB, + queryFunc, userContext, similarityScale, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst); + } + else + { + PointQueryContext context(scene, (PointQuery*)query, + POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)query, &context); + } + return changed; + } + + RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(userContext); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); + if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes"); +#endif + + return pointQuery(scene, query, userContext, queryFunc, userPtr); + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery4* query4 = (PointQuery4*)query; + PointQuery query1; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + query4->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query4->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery8* query8 = (PointQuery8*)query; + PointQuery query1; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + query8->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query8->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery16* query16 = (PointQuery16*)query; + PointQuery query1; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + PointQuery query1; query16->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query16->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT3(normal.travs,1,1,1); + IntersectContext context(scene,user_context); + scene->intersectors.intersect(*rayhit,&context); +#if defined(DEBUG) + ((RayHit*)rayhit)->verifyHit(); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit4* rayhit4 = (RayHit4*)rayhit; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + RayHit ray1; rayhit4->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + rayhit4->set(i,ray1); + } +#else + scene->intersectors.intersect4(valid,*rayhit,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)rayhit) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit8* rayhit8 = (RayHit8*) rayhit; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + RayHit ray1; rayhit8->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + rayhit8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.intersect8(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)rayhit) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit16* rayhit16 = (RayHit16*) rayhit; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + RayHit ray1; rayhit16->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + rayhit16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.intersect16(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) + scene->intersectors.intersect(*rayhit,&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) + scene->intersectors.intersect(*rn[0],&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N*M,N*M,N*M); + IntersectContext context(scene,user_context); + + /* code path for single ray streams */ + if (likely(N == 1)) + { + /* fast code path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar)) + scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context); + } + /* normal codepath for single ray streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes"); + if (((size_t)rayhit->ray.time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes"); + if (((size_t)rayhit->ray.mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes"); + if (((size_t)rayhit->hit.u ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes"); + if (((size_t)rayhit->hit.v ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes"); + if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1); + STAT3(shadow.travs,1,1,1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + IntersectContext context(scene,user_context); + scene->intersectors.occluded(*ray,&context); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray4* ray4 = (Ray4*) ray; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + Ray ray1; ray4->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray4->set(i,ray1); + } +#else + scene->intersectors.occluded4(valid,*ray,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)ray) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray8* ray8 = (Ray8*) ray; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + Ray ray1; ray8->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.occluded8(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)ray) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray16* ray16 = (Ray16*) ray; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + Ray ray1; ray16->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.occluded16(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray->tnear <= ray->tfar)) + scene->intersectors.occluded (*ray,&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray[0]->tnear <= ray[0]->tfar)) + scene->intersectors.occluded (*ray[0],&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small"); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N*M,N*N,N*N); + IntersectContext context(scene,user_context); + + /* codepath for single rays */ + if (likely(N == 1)) + { + /* fast path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar)) + scene->intersectors.occluded (*(RTCRay*)ray,&context); + } + /* codepath for normal ray streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes"); + if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes"); + if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes"); + if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes"); + if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes"); + if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes"); + if (((size_t)ray->time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes"); + if (((size_t)ray->mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainScene); + RTC_VERIFY_HANDLE(hscene); + scene->refInc(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcReleaseScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseScene); + RTC_VERIFY_HANDLE(hscene); + scene->refDec(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref<Scene> scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryInstancedScene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hscene); + geometry->setInstancedScene(scene); + RTC_CATCH_END2(geometry); + } + + AffineSpace3fa loadTransform(RTCFormat format, const float* xfm) + { + AffineSpace3fa space = one; + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]), + Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]), + Vec3fa(xfm[ 2], xfm[ 6], xfm[10]), + Vec3fa(xfm[ 3], xfm[ 7], xfm[11])); + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]), + Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]), + Vec3fa(xfm[ 9], xfm[10], xfm[11])); + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]), + Vec3fa(xfm[ 8], xfm[ 9], xfm[10]), + Vec3fa(xfm[12], xfm[13], xfm[14])); + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + return space; + } + + void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm) + { + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vy.x; xfm[ 2] = space.l.vz.x; xfm[ 3] = space.p.x; + xfm[ 4] = space.l.vx.y; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vz.y; xfm[ 7] = space.p.y; + xfm[ 8] = space.l.vx.z; xfm[ 9] = space.l.vy.z; xfm[10] = space.l.vz.z; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; + xfm[ 3] = space.l.vy.x; xfm[ 4] = space.l.vy.y; xfm[ 5] = space.l.vy.z; + xfm[ 6] = space.l.vz.x; xfm[ 7] = space.l.vz.y; xfm[ 8] = space.l.vz.z; + xfm[ 9] = space.p.x; xfm[10] = space.p.y; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; xfm[ 3] = 0.f; + xfm[ 4] = space.l.vy.x; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vy.z; xfm[ 7] = 0.f; + xfm[ 8] = space.l.vz.x; xfm[ 9] = space.l.vz.y; xfm[10] = space.l.vz.z; xfm[11] = 0.f; + xfm[12] = space.p.x; xfm[13] = space.p.y; xfm[14] = space.p.z; xfm[15] = 1.f; + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + } + + RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransform); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(xfm); + const AffineSpace3fa transform = loadTransform(format, (const float*)xfm); + geometry->setTransform(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransformQuaternion); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(qd); + + AffineSpace3fx transform; + transform.l.vx.x = qd->scale_x; + transform.l.vy.y = qd->scale_y; + transform.l.vz.z = qd->scale_z; + transform.l.vy.x = qd->skew_xy; + transform.l.vz.x = qd->skew_xz; + transform.l.vz.y = qd->skew_yz; + transform.l.vx.y = qd->translation_x; + transform.l.vx.z = qd->translation_y; + transform.l.vy.z = qd->translation_z; + transform.p.x = qd->shift_x; + transform.p.y = qd->shift_y; + transform.p.z = qd->shift_z; + + // normalize quaternion + Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k); + q = normalize(q); + transform.l.vx.w = q.i; + transform.l.vy.w = q.j; + transform.l.vz.w = q.k; + transform.p.w = q.r; + + geometry->setQuaternionDecomposition(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryTransform); + const AffineSpace3fa transform = geometry->getTransform(time); + storeTransform(transform, format, (float*)xfm); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewGeometry); + RTC_VERIFY_HANDLE(hdevice); + + switch (type) + { + case RTC_GEOMETRY_TYPE_TRIANGLE: + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + createTriangleMeshTy createTriangleMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh); + Geometry* geom = createTriangleMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_QUAD: + { +#if defined(EMBREE_GEOMETRY_QUAD) + createQuadMeshTy createQuadMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh); + Geometry* geom = createQuadMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + case RTC_GEOMETRY_TYPE_DISC_POINT: + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + { +#if defined(EMBREE_GEOMETRY_POINT) + createPointsTy createPoints = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints); + + Geometry *geom; + switch(type) { + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + geom = createPoints(device, Geometry::GTY_SPHERE_POINT); + break; + case RTC_GEOMETRY_TYPE_DISC_POINT: + geom = createPoints(device, Geometry::GTY_DISC_POINT); + break; + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT); + break; + default: + geom = nullptr; + break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE: + { +#if defined(EMBREE_GEOMETRY_CURVE) + createLineSegmentsTy createLineSegments = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments); + createCurvesTy createCurves = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves); + + Geometry* geom; + switch (type) { + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break; + //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break; + default: geom = nullptr; break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SUBDIVISION: + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + createSubdivMeshTy createSubdivMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh); + //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason? + Geometry* geom = createSubdivMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_USER: + { +#if defined(EMBREE_GEOMETRY_USER) + createUserGeometryTy createUserGeometry = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry); + Geometry* geom = createUserGeometry(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_INSTANCE: + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + createInstanceTy createInstance = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance); + Geometry* geom = createInstance(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_GRID: + { +#if defined(EMBREE_GEOMETRY_GRID) + createGridMeshTy createGridMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh); + Geometry* geom = createGridMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported"); +#endif + } + + default: + throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type"); + } + + RTC_CATCH_END(device); + return nullptr; + } + + RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserPrimitiveCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); + + geometry->setNumPrimitives(userPrimitiveCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeStepCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (timeStepCount > RTC_MAX_TIME_STEP_COUNT) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range"); + + geometry->setNumTimeSteps(timeStepCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime) + { + Ref<Geometry> geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeRange); + RTC_VERIFY_HANDLE(hgeometry); + + if (startTime > endTime) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime"); + + geometry->setTimeRange(BBox1f(startTime,endTime)); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTopologyCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTopologyCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuildQuality); + RTC_VERIFY_HANDLE(hgeometry); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMaxRadiusScale); + RTC_VERIFY_HANDLE(hgeometry); +#if RTC_MIN_WIDTH + if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1"); + geometry->setMaxRadiusScale(maxRadiusScale); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled"); +#endif + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMask); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setMask(mask); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometrySubdivisionMode); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setSubdivisionMode(topologyID,mode); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeTopology); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeTopology(vertexAttributeID, topologyID); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref<Buffer> buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hbuffer); + + if (geometry->device != buffer->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSharedGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + Ref<Buffer> buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetNewGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */ + size_t bytes = itemCount*byteStride; + if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + bytes += (16 - (byteStride%16))%16; + + Ref<Buffer> buffer = new Buffer(geometry->device, bytes); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + return buffer->data(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryBufferData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getBuffer(type, slot); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcEnableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->enable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcUpdateGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + geometry->updateBuffer(type, slot); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDisableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->disable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTessellationRate); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTessellationRate(tessellationRate); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setUserData(ptr); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; // no ref counting here! + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getUserData(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBoundsFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setBoundsFunction(bounds,userPtr); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryDisplacementFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setDisplacementFunction(displacement); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectFunctionN(intersect); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryPointQueryFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setPointQueryFunction(pointQuery); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFirstHalfEdge); + return geometry->getFirstHalfEdge(faceID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFace); + return geometry->getFace(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryNextHalfEdge); + return geometry->getNextHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryPreviousHalfEdge); + return geometry->getPreviousHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryOppositeHalfEdge); + return geometry->getOppositeHalfEdge(topologyID,edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetOccludedFunctionN); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOccludedFunctionN(occluded); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryOccludedFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOcclusionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolate); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolate(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolateN); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolateN(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcCommitGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitGeometry); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->commit(); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry); + RTC_CATCH_END2(scene); + return -1; + } + + RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometryByID); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_GEOMID(geomID); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + scene->bind(geomID,geometry); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDetachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); + scene->detachGeometry(geomID); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refInc(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refDec(); + RTC_CATCH_END2(geometry); + } + + RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometry); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); +#endif + return (RTCGeometry) scene->get(geomID); + RTC_CATCH_END2(scene); + return nullptr; + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h new file mode 100644 index 0000000000..4b070e122b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h @@ -0,0 +1,142 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../include/embree3/rtcore.h" +RTC_NAMESPACE_USE + +namespace embree +{ + /*! decoding of intersection flags */ + __forceinline bool isCoherent (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; } + __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8) +# define USE_TASK_ARENA 1 +#else +# define USE_TASK_ARENA 0 +#endif + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9 +# define TASKING_TBB_USE_TASK_ISOLATION 1 +#else +# define TASKING_TBB_USE_TASK_ISOLATION 0 +#endif + +/*! Macros used in the rtcore API implementation */ +// -- GODOT start -- +// #define RTC_CATCH_BEGIN try { +#define RTC_CATCH_BEGIN + +// #define RTC_CATCH_END(device) \ +// } catch (std::bad_alloc&) { \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END(device) + +// #define RTC_CATCH_END2(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END2(scene) + +// #define RTC_CATCH_END2_FALSE(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// return false; \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// return false; \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// return false; \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// return false; \ +// } +#define RTC_CATCH_END2_FALSE(scene) return false; +// -- GODOT end -- + +#define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_GEOMID(id) \ + if (id == RTC_INVALID_GEOMETRY_ID) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_UPPER(id,upper) \ + if (id > upper) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_RANGE(id,lower,upper) \ + if (id < lower || id > upper) \ + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds"); + +#if 0 // enable to debug print all API calls +#define RTC_TRACE(x) std::cout << #x << std::endl; +#else +#define RTC_TRACE(x) +#endif + +// -- GODOT begin -- +// /*! used to throw embree API errors */ +// struct rtcore_error : public std::exception +// { +// __forceinline rtcore_error(RTCError error, const std::string& str) +// : error(error), str(str) {} +// +// ~rtcore_error() throw() {} +// +// const char* what () const throw () { +// return str.c_str(); +// } +// +// RTCError error; +// std::string str; +// }; +// -- GODOT end -- + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ + abort(); + // -- GODOT end -- +#endif + +#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ + (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) +} diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp new file mode 100644 index 0000000000..6bb96bba07 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp @@ -0,0 +1,442 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "alloc.h" + +#include "../builders/bvh_builder_sah.h" +#include "../builders/bvh_builder_morton.h" + +namespace embree +{ + namespace isa // FIXME: support more ISAs for builders + { + struct BVH : public RefCount + { + BVH (Device* device) + : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0) + { + device->refInc(); + } + + ~BVH() { + device->refDec(); + } + + public: + Device* device; + FastAllocator allocator; + mvector<BVHBuilderMorton::BuildPrim> morton_src; + mvector<BVHBuilderMorton::BuildPrim> morton_tmp; + }; + + void* rtcBuildBVHMorton(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims_i = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* initialize temporary arrays for morton builder */ + PrimRef* prims = (PrimRef*) prims_i; + mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src; + mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp; + morton_src.resize(primitiveCount); + morton_tmp.resize(primitiveCount); + + /* compute centroid bounds */ + const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa { + + BBox3fa bounds(empty); + for (size_t i=r.begin(); i<r.end(); i++) + bounds.extend(prims[i].bounds().center2()); + return bounds; + }, BBox3fa::merge); + + /* compute morton codes */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]); + for (size_t i=r.begin(); i<r.end(); i++) { + generator(prims[i].bounds(),(unsigned) i); + } + }); + + /* start morton build */ + std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that allocates BVH nodes */ + [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* { + return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + }, + + /* lambda function that sets bounds */ + [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa> + { + BBox3fa bounds = empty; + void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) { + bounds.extend(children[i].second); + childptrs[i] = children[i].first; + cbounds[i] = (const RTCBounds*)&children[i].second; + } + setNodeBounds(node,cbounds,(unsigned int)N,userPtr); + setNodeChildren(node,childptrs, (unsigned int)N,userPtr); + return std::make_pair(node,bounds); + }, + + /* lambda function that creates BVH leaves */ + [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa> + { + RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; + BBox3fa bounds = empty; + for (size_t i=0;i<current.size();i++) + { + const size_t id = morton_src[current.begin()+i].index; + bounds.extend(prims[id].bounds()); + localBuildPrims[i] = prims_i[id]; + } + void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr); + return std::make_pair(node,bounds); + }, + + /* lambda that calculates the bounds for some primitive */ + [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa { + return prims[morton.index].bounds(); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + morton_src.data(),morton_tmp.data(),primitiveCount, + *arguments); + + bvh->allocator.cleanup(); + return root.first; + } + + void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* calculate priminfo */ + auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t j=r.begin(); j<r.end(); j++) + bounds.extend((BBox3fa&)prims[j]); + return bounds; + }; + const CentGeomBBox3fa bounds = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* build BVH */ + void* root = BVHBuilderBinnedSAH::build<void*>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds; + setNodeBounds(node,cbounds, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that updates BVH nodes */ + [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims,pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + + static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) { + CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first); + unsigned int maxGeomID = max(a.second,b.second); + return std::pair<CentGeomBBox3fa,unsigned int>(centBounds,maxGeomID); + } + + void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* calculate priminfo */ + + auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int> + { + CentGeomBBox3fa bounds(empty); + unsigned maxGeomID = 0; + for (size_t j=r.begin(); j<r.end(); j++) + { + bounds.extend((BBox3fa&)prims[j]); + maxGeomID = max(maxGeomID,prims[j].geomID); + } + return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID); + }; + + + const std::pair<CentGeomBBox3fa,unsigned int> pair = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair); + + CentGeomBBox3fa bounds = pair.first; + const unsigned int maxGeomID = pair.second; + + if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)))) + { + /* fallback code for max geomID larger than threshold */ + return rtcBuildBVHBinnedSAH(arguments); + } + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* function that splits a build primitive */ + struct Splitter + { + Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr) + : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {} + + __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const + { + prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK; + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + left_o.geomIDref() = geomID; left_o.primIDref() = primID; + right_o.geomIDref() = geomID; right_o.primIDref() = primID; + } + + __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const + { + PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID); + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + } + + RTCSplitPrimitiveFunction splitPrimitive; + unsigned geomID; + unsigned primID; + void* userPtr; + }; + + /* build BVH */ + void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds; + setNodeBounds(node,cbounds, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that updates BVH nodes */ + [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* returns the splitter */ + [&] ( const PrimRef& prim ) -> Splitter { + return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims, + arguments->primitiveArrayCapacity, + pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + } +} + +using namespace embree; +using namespace embree::isa; + +RTC_NAMESPACE_BEGIN + + RTC_API RTCBVH rtcNewBVH(RTCDevice device) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewAllocator); + RTC_VERIFY_HANDLE(device); + BVH* bvh = new BVH((Device*)device); + return (RTCBVH) bvh->refInc(); + RTC_CATCH_END((Device*)device); + return nullptr; + } + + RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcBuildBVH); + RTC_VERIFY_HANDLE(bvh); + RTC_VERIFY_HANDLE(arguments); + RTC_VERIFY_HANDLE(arguments->createNode); + RTC_VERIFY_HANDLE(arguments->setNodeChildren); + RTC_VERIFY_HANDLE(arguments->setNodeBounds); + RTC_VERIFY_HANDLE(arguments->createLeaf); + + if (arguments->primitiveArrayCapacity < arguments->primitiveCount) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount") + + /* initialize the allocator */ + bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); + bvh->allocator.reset(); + + /* switch between differnet builders based on quality level */ + if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW) + return rtcBuildBVHMorton(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) + return rtcBuildBVHBinnedSAH(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) { + if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount) + return rtcBuildBVHBinnedSAH(arguments); + else + return rtcBuildBVHSpatialSAH(arguments); + } + else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality"); + + /* if we are in dynamic mode, then do not clear temporary data */ + if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) + { + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + } + + RTC_CATCH_END(bvh->device); + return nullptr; + } + + RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) + { + FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcThreadLocalAlloc); + return alloc->malloc0(bytes,align); + RTC_CATCH_END(alloc->alloc->getDevice()); + return nullptr; + } + + RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcStaticBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + RTC_CATCH_END(bvh->device); + } + + RTC_API void rtcRetainBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refInc(); + RTC_CATCH_END(device); + } + + RTC_API void rtcReleaseBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refDec(); + RTC_CATCH_END(device); + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp new file mode 100644 index 0000000000..1e23aeb415 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp @@ -0,0 +1,976 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + /* error raising rtcIntersect and rtcOccluded functions */ + void missing_rtcCommit() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); } + void invalid_rtcIntersect1() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); } + void invalid_rtcIntersect4() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); } + void invalid_rtcIntersect8() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); } + void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); } + void invalid_rtcIntersectN() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); } + + Scene::Scene (Device* device) + : device(device), + flags_modified(true), enabled_geometry_types(0), + scene_flags(RTC_SCENE_FLAG_NONE), + quality_flags(RTC_BUILD_QUALITY_MEDIUM), + is_build(false), modified(true), + progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0) + { + device->refInc(); + + intersectors = Accel::Intersectors(missing_rtcCommit); + + /* one can overwrite flags through device for debugging */ + if (device->quality_flags != -1) + quality_flags = (RTCBuildQuality) device->quality_flags; + if (device->scene_flags != -1) + scene_flags = (RTCSceneFlags) device->scene_flags; + } + + Scene::~Scene() noexcept + { + device->refDec(); + } + + void Scene::printStatistics() + { + /* calculate maximum number of time segments */ + unsigned max_time_steps = 0; + for (size_t i=0; i<size(); i++) { + if (!get(i)) continue; + max_time_steps = max(max_time_steps,get(i)->numTimeSteps); + } + + /* initialize vectors*/ + std::vector<size_t> statistics[Geometry::GTY_END]; + for (size_t i=0; i<Geometry::GTY_END; i++) + statistics[i].resize(max_time_steps); + + /* gather statistics */ + for (size_t i=0; i<size(); i++) + { + if (!get(i)) continue; + int ty = get(i)->getType(); + assert(ty<Geometry::GTY_END); + int timesegments = get(i)->numTimeSegments(); + assert((unsigned int)timesegments < max_time_steps); + statistics[ty][timesegments] += get(i)->size(); + } + + /* print statistics */ + std::cout << std::setw(23) << "segments" << ": "; + for (size_t t=0; t<max_time_steps; t++) + std::cout << std::setw(10) << t; + std::cout << std::endl; + + std::cout << "-------------------------"; + for (size_t t=0; t<max_time_steps; t++) + std::cout << "----------"; + std::cout << std::endl; + + for (size_t p=0; p<Geometry::GTY_END; p++) + { + if (std::string(Geometry::gtype_names[p]) == "") continue; + std::cout << std::setw(23) << Geometry::gtype_names[p] << ": "; + for (size_t t=0; t<max_time_steps; t++) + std::cout << std::setw(10) << statistics[p][t]; + std::cout << std::endl; + } + } + + void Scene::createTriangleAccel() + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + + break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->tri_accel == "bvh4.triangle4") accels_add(device->bvh4_factory->BVH4Triangle4 (this)); + else if (device->tri_accel == "bvh4.triangle4v") accels_add(device->bvh4_factory->BVH4Triangle4v(this)); + else if (device->tri_accel == "bvh4.triangle4i") accels_add(device->bvh4_factory->BVH4Triangle4i(this)); + else if (device->tri_accel == "qbvh4.triangle4i") accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel == "bvh8.triangle4") accels_add(device->bvh8_factory->BVH8Triangle4 (this)); + else if (device->tri_accel == "bvh8.triangle4v") accels_add(device->bvh8_factory->BVH8Triangle4v(this)); + else if (device->tri_accel == "bvh8.triangle4i") accels_add(device->bvh8_factory->BVH8Triangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4i") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel); +#endif + } + + void Scene::createTriangleMBAccel() + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb); +#endif + } + + void Scene::createQuadAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + /* static */ + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->quad_accel == "bvh4.quad4v") accels_add(device->bvh4_factory->BVH4Quad4v(this)); + else if (device->quad_accel == "bvh4.quad4i") accels_add(device->bvh4_factory->BVH4Quad4i(this)); + else if (device->quad_accel == "qbvh4.quad4i") accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel == "bvh8.quad4v") accels_add(device->bvh8_factory->BVH8Quad4v(this)); + else if (device->quad_accel == "bvh8.quad4i") accels_add(device->bvh8_factory->BVH8Quad4i(this)); + else if (device->quad_accel == "qbvh8.quad4i") accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel); +#endif + } + + void Scene::createQuadMBAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb); +#endif + } + + void Scene::createHairAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel); +#endif + } + + void Scene::createHairMBAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower + { + if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + } + } + else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb); +#endif + } + + void Scene::createSubdivAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + } + else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel); +#endif + } + + void Scene::createSubdivMBAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel_mb == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this)); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb); +#endif + } + + void Scene::createUserGeometryAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel); +#endif + } + + void Scene::createUserGeometryMBAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel_mb == "default" ) { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); + else +#endif + accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); + } + else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb); +#endif + } + + void Scene::createInstanceAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, false)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, false)); + } + //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createInstanceExpensiveAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceExpensiveMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, true)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, true)); + } + //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createGridAccel() + { + BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + else +#endif + { + accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + } + else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel); +#endif + + } + + void Scene::createGridMBAccel() + { +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel_mb == "default") + { + accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC)); + } + else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel); +#endif + + } + + void Scene::clear() { + } + + unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(geometriesMutex); +#else + Lock<SpinLock> lock(geometriesMutex); +#endif + if (geomID == RTC_INVALID_GEOMETRY_ID) { + geomID = id_pool.allocate(); + if (geomID == RTC_INVALID_GEOMETRY_ID) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene"); + } + else + { + if (!id_pool.add(geomID)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided"); + } + if (geomID >= geometries.size()) { + geometries.resize(geomID+1); + vertices.resize(geomID+1); + geometryModCounters_.resize(geomID+1); + } + geometries[geomID] = geometry; + geometryModCounters_[geomID] = 0; + if (geometry->isEnabled()) { + setModified (); + } + return geomID; + } + + void Scene::detachGeometry(size_t geomID) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(geometriesMutex); +#else + Lock<SpinLock> lock(geometriesMutex); +#endif + + if (geomID >= geometries.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID"); + + Ref<Geometry>& geometry = geometries[geomID]; + if (geometry == null) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); + + if (geometry->isEnabled()) { + setModified (); + } + accels_deleteGeometry(unsigned(geomID)); + id_pool.deallocate((unsigned)geomID); + geometries[geomID] = null; + vertices[geomID] = nullptr; + geometryModCounters_[geomID] = 0; + } + + void Scene::updateInterface() + { + is_build = true; + } + + void Scene::commit_task () + { + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* print scene statistics */ + if (device->verbosity(2)) + printStatistics(); + + progress_monitor_counter = 0; + + /* gather scene stats and call preCommit function of each geometry */ + this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), + [this](const range<size_t>& r)->GeometryCounts + { + GeometryCounts c; + for (auto i=r.begin(); i<r.end(); ++i) + { + if (geometries[i] && geometries[i]->isEnabled()) + { + geometries[i]->preCommit(); + geometries[i]->addElementsToCount (c); + c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions(); + } + } + return c; + }, + std::plus<GeometryCounts>() + ); + + /* select acceleration structures to build */ + unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask(); + if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) + { + accels_init(); + + /* we need to make all geometries modified, otherwise two level builder will + not rebuild currently not modified geometries */ + parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) { + geometryModCounters_[i] = 0; + }); + + if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel(); + if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel(); + if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel(); + if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel(); + if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel(); + if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel(); + if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel(); + if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel(); + + flags_modified = false; + enabled_geometry_types = new_enabled_geometry_types; + } + + /* select fast code path if no filter function is present */ + accels_select(hasFilterFunction()); + + /* build all hierarchies of this scene */ + accels_build(); + + /* make static geometry immutable */ + if (!isDynamicAccel()) { + accels_immutable(); + flags_modified = true; // in non-dynamic mode we have to re-create accels + } + + /* call postCommit function of each geometry */ + parallel_for(geometries.size(), [&] ( const size_t i ) { + if (geometries[i] && geometries[i]->isEnabled()) { + geometries[i]->postCommit(); + vertices[i] = geometries[i]->getCompactVertexArray(); + geometryModCounters_[i] = geometries[i]->getModCounter(); + } + }); + + updateInterface(); + + if (device->verbosity(2)) { + std::cout << "created scene intersector" << std::endl; + accels_print(2); + std::cout << "selected scene intersector" << std::endl; + intersectors.print(2); + } + + setModified(false); + } + + void Scene::setBuildQuality(RTCBuildQuality quality_flags_i) + { + if (quality_flags == quality_flags_i) return; + quality_flags = quality_flags_i; + flags_modified = true; + } + + RTCBuildQuality Scene::getBuildQuality() const { + return quality_flags; + } + + void Scene::setSceneFlags(RTCSceneFlags scene_flags_i) + { + if (scene_flags == scene_flags_i) return; + scene_flags = scene_flags_i; + flags_modified = true; + } + + RTCSceneFlags Scene::getSceneFlags() const { + return scene_flags; + } + +#if defined(TASKING_INTERNAL) + + void Scene::commit (bool join) + { + Lock<MutexSys> buildLock(buildMutex,false); + + /* allocates own taskscheduler for each build */ + Ref<TaskScheduler> scheduler = nullptr; + { + Lock<MutexSys> lock(schedulerMutex); + scheduler = this->scheduler; + if (scheduler == null) { + buildLock.lock(); + this->scheduler = scheduler = new TaskScheduler; + } + } + + /* worker threads join build */ + if (!buildLock.isLocked()) + { + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); + + scheduler->join(); + return; + } + + /* initiate build */ + // -- GODOT start -- + // try { + scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); + // } + // catch (...) { + // accels_clear(); + // updateInterface(); + // Lock<MutexSys> lock(schedulerMutex); + // this->scheduler = nullptr; + // throw; + // } + // -- GODOT end -- + } + +#endif + +#if defined(TASKING_TBB) || defined(TASKING_GCD) + + void Scene::commit (bool join) + { +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version"); +#endif + + /* try to obtain build lock */ + Lock<MutexSys> lock(buildMutex,buildMutex.try_lock()); + + /* join hierarchy build */ + if (!lock.isLocked()) + { +#if !TASKING_TBB_USE_TASK_ISOLATION + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); +#endif + + do { + +#if defined(TASKING_GCD) + // Do Nothing +#else +#if USE_TASK_ARENA + if (join) { + device->arena->execute([&]{ group.wait(); }); + } + else +#endif + { + group.wait(); + } +#endif + + pause_cpu(); + yield(); + + } while (!buildMutex.try_lock()); + + buildMutex.unlock(); + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION_MAJOR < 8 + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); +#else + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); +#endif + //ctx.set_priority(tbb::priority_high); + +#if USE_TASK_ARENA + if (join) + { + device->arena->execute([&]{ + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + }); + } + else +#endif + { + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + } + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + +#elif defined(TASKING_GCD) + + commit_task(); + +#endif // #if defined(TASKING_TBB) + + } + catch (...) + { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + +#if defined(TASKING_PPL) + + void Scene::commit (bool join) + { +#if defined(TASKING_PPL) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL"); +#endif + + /* try to obtain build lock */ + Lock<MutexSys> lock(buildMutex); + + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { + + group.run([&]{ + concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); }); + }); + group.wait(); + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + } + catch (...) + { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + + void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) + { + progress_monitor_function = func; + progress_monitor_ptr = ptr; + } + + void Scene::progressMonitor(double dn) + { + if (progress_monitor_function) { + size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn)); + if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) { + throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination"); + } + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h new file mode 100644 index 0000000000..b41c6cde91 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene.h @@ -0,0 +1,390 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "builder.h" +#include "../../common/algorithms/parallel_any_of.h" +#include "scene_triangle_mesh.h" +#include "scene_quad_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_line_segments.h" +#include "scene_subdiv_mesh.h" +#include "scene_grid_mesh.h" +#include "scene_points.h" +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +namespace embree +{ + /*! Base class all scenes are derived from */ + class Scene : public AccelN + { + ALIGNED_CLASS_(std::alignment_of<Scene>::value); + + public: + template<typename Ty, bool mblur = false> + class Iterator + { + public: + Iterator () {} + + Iterator (Scene* scene, bool all = false) + : scene(scene), all(all) {} + + __forceinline Ty* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!all && !geom->isEnabled()) return nullptr; + const size_t mask = geom->getTypeMask() & Ty::geom_type; + if (!(mask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return nullptr; + return (Ty*) geom; + } + + __forceinline Ty* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + __forceinline size_t numPrimitives() const { + return scene->getNumPrimitives(Ty::geom_type,mblur); + } + + __forceinline size_t maxPrimitivesPerGeometry() + { + size_t ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->size()); + } + return ret; + } + + __forceinline unsigned int maxGeomID() + { + unsigned int ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,(unsigned int)i); + } + return ret; + } + + __forceinline unsigned maxTimeStepsPerGeometry() + { + unsigned ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->numTimeSteps); + } + return ret; + } + + private: + Scene* scene; + bool all; + }; + + class Iterator2 + { + public: + Iterator2 () {} + + Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) + : scene(scene), typemask(typemask), mblur(mblur) {} + + __forceinline Geometry* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!geom->isEnabled()) return nullptr; + if (!(geom->getTypeMask() & typemask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return nullptr; + return geom; + } + + __forceinline Geometry* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + private: + Scene* scene; + Geometry::GTypeMask typemask; + bool mblur; + }; + + public: + + /*! Scene construction */ + Scene (Device* device); + + /*! Scene destruction */ + ~Scene () noexcept; + + private: + /*! class is non-copyable */ + Scene (const Scene& other) DELETED; // do not implement + Scene& operator= (const Scene& other) DELETED; // do not implement + + public: + void createTriangleAccel(); + void createTriangleMBAccel(); + void createQuadAccel(); + void createQuadMBAccel(); + void createHairAccel(); + void createHairMBAccel(); + void createSubdivAccel(); + void createSubdivMBAccel(); + void createUserGeometryAccel(); + void createUserGeometryMBAccel(); + void createInstanceAccel(); + void createInstanceMBAccel(); + void createInstanceExpensiveAccel(); + void createInstanceExpensiveMBAccel(); + void createGridAccel(); + void createGridMBAccel(); + + /*! prints statistics about the scene */ + void printStatistics(); + + /*! clears the scene */ + void clear(); + + /*! detaches some geometry */ + void detachGeometry(size_t geomID); + + void setBuildQuality(RTCBuildQuality quality_flags); + RTCBuildQuality getBuildQuality() const; + + void setSceneFlags(RTCSceneFlags scene_flags); + RTCSceneFlags getSceneFlags() const; + + void commit (bool join); + void commit_task (); + void build () {} + + void updateInterface(); + + /* return number of geometries */ + __forceinline size_t size() const { return geometries.size(); } + + /* bind geometry to the scene */ + unsigned int bind (unsigned geomID, Ref<Geometry> geometry); + + /* determines if scene is modified */ + __forceinline bool isModified() const { return modified; } + + /* sets modified flag */ + __forceinline void setModified(bool f = true) { + modified = f; + } + + __forceinline bool isGeometryModified(size_t geomID) + { + Ref<Geometry>& g = geometries[geomID]; + if (!g) return false; + return g->getModCounter() > geometryModCounters_[geomID]; + } + + protected: + + __forceinline void checkIfModifiedAndSet () + { + if (isModified ()) return; + + auto geometryIsModified = [this](size_t geomID)->bool { + return isGeometryModified(geomID); + }; + + if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) { + setModified (); + } + } + + public: + + /* get mesh by ID */ + __forceinline Geometry* get(size_t i) { assert(i < geometries.size()); return geometries[i].ptr; } + __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; } + + template<typename Mesh> + __forceinline Mesh* get(size_t i) { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + template<typename Mesh> + __forceinline const Mesh* get(size_t i) const { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + + template<typename Mesh> + __forceinline Mesh* getSafe(size_t i) { + assert(i < geometries.size()); + if (geometries[i] == null) return nullptr; + if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr; + else return (Mesh*) geometries[i].ptr; + } + + __forceinline Ref<Geometry> get_locked(size_t i) { + Lock<SpinLock> lock(geometriesMutex); + assert(i < geometries.size()); + return geometries[i]; + } + + /* flag decoding */ + __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); } + __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; } + __forceinline bool isRobustAccel() const { return scene_flags & RTC_SCENE_FLAG_ROBUST; } + __forceinline bool isStaticAccel() const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); } + __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; } + + __forceinline bool hasContextFilterFunction() const { + return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION; + } + + __forceinline bool hasGeometryFilterFunction() { + return world.numFilterFunctions != 0; + } + + __forceinline bool hasFilterFunction() { + return hasContextFilterFunction() || hasGeometryFilterFunction(); + } + + /* test if scene got already build */ + __forceinline bool isBuild() const { return is_build; } + + public: + IDPool<unsigned,0xFFFFFFFE> id_pool; + vector<Ref<Geometry>> geometries; //!< list of all user geometries + vector<unsigned int> geometryModCounters_; + vector<float*> vertices; + + public: + Device* device; + + /* these are to detect if we need to recreate the acceleration structures */ + bool flags_modified; + unsigned int enabled_geometry_types; + + RTCSceneFlags scene_flags; + RTCBuildQuality quality_flags; + MutexSys buildMutex; + SpinLock geometriesMutex; + bool is_build; + private: + bool modified; //!< true if scene got modified + + public: + + /*! global lock step task scheduler */ +#if defined(TASKING_INTERNAL) + MutexSys schedulerMutex; + Ref<TaskScheduler> scheduler; +#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + tbb::isolated_task_group group; +#elif defined(TASKING_TBB) + tbb::task_group group; +#elif defined(TASKING_PPL) + concurrency::task_group group; +#endif + + public: + struct BuildProgressMonitorInterface : public BuildProgressMonitor { + BuildProgressMonitorInterface(Scene* scene) + : scene(scene) {} + void operator() (size_t dn) const { scene->progressMonitor(double(dn)); } + private: + Scene* scene; + }; + BuildProgressMonitorInterface progressInterface; + RTCProgressMonitorFunction progress_monitor_function; + void* progress_monitor_ptr; + std::atomic<size_t> progress_monitor_counter; + void progressMonitor(double nprims); + void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr); + + private: + GeometryCounts world; //!< counts for geometry + + public: + + __forceinline size_t numPrimitives() const { + return world.size(); + } + + __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const + { + size_t count = 0; + + if (mask & Geometry::MTY_TRIANGLE_MESH) + count += mblur ? world.numMBTriangles : world.numTriangles; + + if (mask & Geometry::MTY_QUAD_MESH) + count += mblur ? world.numMBQuads : world.numQuads; + + if (mask & Geometry::MTY_CURVE2) + count += mblur ? world.numMBLineSegments : world.numLineSegments; + + if (mask & Geometry::MTY_CURVE4) + count += mblur ? world.numMBBezierCurves : world.numBezierCurves; + + if (mask & Geometry::MTY_POINTS) + count += mblur ? world.numMBPoints : world.numPoints; + + if (mask & Geometry::MTY_SUBDIV_MESH) + count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches; + + if (mask & Geometry::MTY_USER_GEOMETRY) + count += mblur ? world.numMBUserGeometries : world.numUserGeometries; + + if (mask & Geometry::MTY_INSTANCE_CHEAP) + count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap; + + if (mask & Geometry::MTY_INSTANCE_EXPENSIVE) + count += mblur ? world.numMBInstancesExpensive : world.numInstancesExpensive; + + if (mask & Geometry::MTY_GRID_MESH) + count += mblur ? world.numMBGrids : world.numGrids; + + return count; + } + + template<typename Mesh, bool mblur> + __forceinline unsigned getNumTimeSteps() + { + if (!mblur) + return 1; + + Scene::Iterator<Mesh,mblur> iter(this); + return iter.maxTimeStepsPerGeometry(); + } + + template<typename Mesh, bool mblur> + __forceinline unsigned int getMaxGeomID() + { + Scene::Iterator<Mesh,mblur> iter(this); + return iter.maxGeomID(); + } + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h new file mode 100644 index 0000000000..2649ab0e3e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_curves.h @@ -0,0 +1,341 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! represents an array of bicubic bezier curves */ + struct CurveGeometry : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4; + + public: + + /*! bezier curve construction */ + CurveGeometry (Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th curve */ + __forceinline const unsigned int& curve(size_t i) const { + return curves[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th tangent of the first time step */ + __forceinline Vec3ff tangent(size_t i) const { + return tangents0[i]; + } + + /*! returns i'th normal derivative of the first time step */ + __forceinline Vec3fa dnormal(size_t i) const { + return dnormals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th tangent of itime'th timestep */ + __forceinline Vec3ff tangent(size_t i, size_t itime) const { + return tangents[itime][i]; + } + + /*! returns i'th normal derivative of itime'th timestep */ + __forceinline Vec3fa dnormal(size_t i, size_t itime) const { + return dnormals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + n0 = normal(i+0); + n1 = normal(i+1); + n2 = normal(i+2); + n3 = normal(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + n2 = normal(i+2,itime); + n3 = normal(i+3,itime); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL1_vertices(size_t i) const + { + prefetchL1(vertices0.getPtr(i)+0); + prefetchL1(vertices0.getPtr(i)+64); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL2_vertices(size_t i) const + { + prefetchL2(vertices0.getPtr(i)+0); + prefetchL2(vertices0.getPtr(i)+64); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; + gather(a0,a1,a2,a3,i,itime); + Vec3ff b0,b1,b2,b3; + gather(b0,b1,b2,b3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3; + gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime); + Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3; + gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + n0 = madd(Vec3ff(t0),an0,t1*bn0); + n1 = madd(Vec3ff(t0),an1,t1*bn1); + n2 = madd(Vec3ff(t0),an2,t1*bn2); + n3 = madd(Vec3ff(t0),an3,t1*bn3); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3; + unsigned int vertexID = curve(primID); + gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime); + SourceCurve3ff ccurve(v0,v1,v2,v3); + SourceCurve3fa ncurve(n0,n1,n2,n3); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; + gather_hermite(ap0,at0,ap1,at1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; + gather_hermite(bp0,bt0,bp1,bt1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + n0 = normal(i+0); + n1 = normal(i+1); + dn0 = dnormal(i+0); + dn1 = dnormal(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + dn0 = dnormal(i+0,itime); + dn1 = dnormal(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1; + gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1; + gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + n0 = madd(Vec3ff(f0),an0,f1*bn0); + dn0= madd(Vec3ff(f0),adn0,f1*bdn0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + n1 = madd(Vec3ff(f0),an1,f1*bn1); + dn1= madd(Vec3ff(f0),adn1,f1*bdn1); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1; + unsigned int vertexID = curve(primID); + gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime); + + SourceCurve3ff ccurve(v0,t0,v1,t1); + SourceCurve3fa ncurve(n0,dn0,n1,dn1); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + private: + void resizeBuffers(unsigned int numSteps); + + public: + BufferView<unsigned int> curves; //!< array of curve indices + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + BufferView<Vec3ff> tangents0; //!< fast access to first tangent buffer + BufferView<Vec3fa> dnormals0; //!< fast access to first normal derivative buffer + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<Vec3ff>> tangents; //!< tangent array for each timestep + vector<BufferView<Vec3fa>> dnormals; //!< normal derivative array for each timestep + BufferView<char> flags; //!< start, end flag per segment + vector<BufferView<char>> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for flat curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h new file mode 100644 index 0000000000..c08658466a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h @@ -0,0 +1,215 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Grid Mesh */ + struct GridMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH; + + /*! grid */ + struct Grid + { + unsigned int startVtxID; + unsigned int lineVtxOffset; + unsigned short resX,resY; + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsX(const unsigned int x) const + { + return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0; + } + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsY(const unsigned int y) const + { + return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0; + } + + /*! outputs grid structure */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) { + return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }"; + } + }; + + public: + + /*! grid mesh construction */ + GridMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + __forceinline unsigned int getNumSubGrids(const size_t gridID) + { + const Grid &g = grid(gridID); + return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1)); + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th grid*/ + __forceinline const Grid& grid(size_t i) const { + return grids[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th vertex of the first timestep */ + __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const { + assert(x < (size_t)g.resX); + assert(y < (size_t)g.resY); + return g.startVtxID + x + y * g.lineVtxOffset; + } + + /*! returns i'th vertex of the first timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index); + } + + /*! returns i'th vertex of the itime'th timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index,itime); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const + { + BBox3fa b(empty); + for (size_t t=0; t<numTimeSteps; t++) + { + for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++) + for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++) + { + const Vec3fa v = grid_vertex(g,x,y,t); + if (unlikely(!isvalid(v))) return false; + b.extend(v); + } + } + + bbox = b; + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const + { + assert(itime < numTimeSteps); + BBox3fa b0(empty); + for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++) + for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++) + { + const Vec3fa v = grid_vertex(g,x,y,itime); + if (unlikely(!isvalid(v))) return false; + b0.extend(v); + } + + /* use bounds of first time step in builder */ + bbox = b0; + return true; + } + + __forceinline bool valid(size_t gridID, size_t itime=0) const { + return valid(gridID, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const + { + if (unlikely(gridID >= grids.size())) return false; + const Grid &g = grid(gridID); + if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false; + if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false; + + for (size_t y=0;y<g.resY;y++) + for (size_t x=0;x<g.resX;x++) + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(grid_vertex(g,x,y,itime))) return false; + return true; + } + + + __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const + { + BBox3fa box(empty); + buildBounds(g,sx,sy,itime,box); + return box; + } + + __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const { + BBox3fa bounds0, bounds1; + buildBounds(g,sx,sy,itime+0,bounds0); + buildBounds(g,sx,sy,itime+1,bounds1); + return LBBox3fa(bounds0,bounds1); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments); + } + + public: + BufferView<Grid> grids; //!< array of triangles + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<RawBufferView> vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct GridMeshISA : public GridMesh + { + GridMeshISA (Device* device) + : GridMesh(device) {} + }; + } + + DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h new file mode 100644 index 0000000000..7ff82a4fb8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_instance.h @@ -0,0 +1,272 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "accel.h" + +namespace embree +{ + struct MotionDerivativeCoefficients; + + /*! Instanced acceleration structure */ + struct Instance : public Geometry + { + ALIGNED_STRUCT_(16); + static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE; + + public: + Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1); + ~Instance(); + + private: + Instance (const Instance& other) DELETED; // do not implement + Instance& operator= (const Instance& other) DELETED; // do not implement + + private: + LBBox3fa nonlinearBounds(const BBox1f& time_range_in, + const BBox1f& geom_time_range, + float geom_time_segments) const; + + BBox3fa boundSegment(size_t itime, + BBox3fa const& obbox0, BBox3fa const& obbox1, + BBox3fa const& bbox0, BBox3fa const& bbox1, + float t_min, float t_max) const; + + /* calculates the (correct) interpolated bounds */ + __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(slerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + return xfmBounds(lerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + } + + public: + virtual void setNumTimeSteps (unsigned int numTimeSteps) override; + virtual void setInstancedScene(const Ref<Scene>& scene) override; + virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override; + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override; + virtual AffineSpace3fa getTransform(float time) override; + virtual void setMask (unsigned mask) override; + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const override; + virtual void commit() override; + + public: + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds()); + return xfmBounds(local2world[0],object->bounds.bounds()); + } + + /*! gets the bounds of the instanced scene */ + __forceinline BBox3fa getObjectBounds(size_t itime) const { + return object->getBounds(timeStep(itime)); + } + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime)); + return xfmBounds(local2world[itime],getObjectBounds(itime)); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const { + assert(i == 0); + LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments); + return lbbox; + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + assert(i==0); + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + assert(i==0); + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds (); + return isvalid(bounds); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + assert(i == 0); + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(bounds(i,itime))) return false; + + return true; + } + + __forceinline AffineSpace3fa getLocal2World() const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return quaternionDecompositionToAffineSpace(local2world[0]); + return local2world[0]; + } + + __forceinline AffineSpace3fa getLocal2World(float t) const + { + float ftime; const unsigned int itime = timeSegment(t, ftime); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return slerp(local2world[itime+0],local2world[itime+1],ftime); + return lerp(local2world[itime+0],local2world[itime+1],ftime); + } + + __forceinline AffineSpace3fa getWorld2Local() const { + return world2local0; + } + + __forceinline AffineSpace3fa getWorld2Local(float t) const { + return rcp(getLocal2World(t)); + } + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return getWorld2LocalSlerp(valid, t); + return getWorld2LocalLerp(valid, t); + } + + private: + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const + { + vfloat<K> ftime; + const vint<K> itime_k = timeSegment(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint<K>(itime)))) { + return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]), + AffineSpace3vff<K>(local2world[itime+1]), + ftime)); + } + else { + AffineSpace3vff<K> space0,space1; + vbool<K> valid1 = valid; + while (any(valid1)) { + vbool<K> valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0); + space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1); + } + return rcp(slerp(space0, space1, ftime)); + } + } + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const + { + vfloat<K> ftime; + const vint<K> itime_k = timeSegment(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint<K>(itime)))) { + return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), + AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), + ftime)); + } else { + AffineSpace3vf<K> space0,space1; + vbool<K> valid1 = valid; + while (any(valid1)) { + vbool<K> valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0); + space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1); + } + return rcp(lerp(space0, space1, ftime)); + } + } + + public: + Accel* object; //!< pointer to instanced acceleration structure + AffineSpace3ff* local2world; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition) + AffineSpace3fa world2local0; //!< transformation from world space to local space for timestep 0 + }; + + namespace isa + { + struct InstanceISA : public Instance + { + InstanceISA (Device* device) + : Instance(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // const BBox3fa b = bounds(0); + // if (!isvalid(b)) return pinfo; + + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // if (!valid(0,range<size_t>(itime))) return pinfo; + // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0)); + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfoMB pinfo(empty); + if (!valid(0, timeSegmentRange(t0t1))) return pinfo; + const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0)); + pinfo.add_primref(prim); + prims[k++] = prim; + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h new file mode 100644 index 0000000000..c0f9ee8f77 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h @@ -0,0 +1,307 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! represents an array of line segments */ + struct LineSegments : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2; + + public: + + /*! line segments construction */ + LineSegments (Device* device, Geometry::GType gtype); + + public: + void setMask (unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify (); + void interpolate(const RTCInterpolateArguments* const args); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th segment */ + __forceinline const unsigned int& segment(size_t i) const { + return segments[i]; + } + + /*! returns the segment to the left of the i'th segment */ + __forceinline bool segmentLeftExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0; + } + + /*! returns the segment to the right of the i'th segment */ + __forceinline bool segmentRightExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const + { + const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1)); + return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w))); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = segment(i); + if (index+1 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false; + const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false; + if (min(v0.w,v1.w) < 0.0f) return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i,0)) return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i,itime+0) || !valid(i,itime+1)) return false; + bbox = bounds(i,itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView<unsigned int> segments; //!< array of line segment indices + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + BufferView<char> flags; //!< start, end flag per segment + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<char>> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for bezier curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct LineSegmentsISA : public LineSegments + { + LineSegmentsISA (Device* device, Geometry::GType gtype) + : LineSegments(device,gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0); + const Vec3fa v1 = vertex(vtxID+1); + return v1-v0; + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0,time); + const Vec3fa v1 = vertex(vtxID+1,time); + return v1-v0; + } + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const { + return bounds(space,i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + return linearBounds(primID,time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + return linearBounds(space,primID,time_range); + } + }; + } + + DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h new file mode 100644 index 0000000000..1d39ed07ba --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_points.h @@ -0,0 +1,282 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "buffer.h" +#include "default.h" +#include "geometry.h" + +namespace embree +{ + /*! represents an array of points */ + struct Points : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS; + + public: + /*! line segments construction */ + Points(Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps(unsigned int numTimeSteps); + void setVertexAttributeCount(unsigned int N); + void setBuffer(RTCBufferType type, + unsigned int slot, + RTCFormat format, + const Ref<Buffer>& buffer, + size_t offset, + size_t stride, + unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0) const { + return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w)); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const Vec3ff v0 = vertex(i); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const Vec3ff v0 = vertex(i); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = (unsigned int)i; + if (index >= numVertices()) + return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) { + const Vec3ff v0 = vertex(index + 0, itime); + if (unlikely(!isvalid4(v0))) + return false; + if (v0.w < 0.0f) + return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i, 0)) + return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i, itime + 0) || !valid(i, itime + 1)) + return false; + bbox = bounds(i, itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<char>> vertexAttribs; //!< user buffers + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct PointsISA : public Points + { + PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + return Vec3fa(1, 0, 0); + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + return Vec3fa(1, 0, 0); + } + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, &bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, itime, bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, + const BBox1f& t0t1, + const range<size_t>& r, + size_t k, + unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + if (!valid(j, timeSegmentRange(t0t1))) + continue; + const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const + { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const + { + return bounds(space, i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const + { + return linearBounds(primID, time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const + { + return linearBounds(space, primID, time_range); + } + }; + } // namespace isa + + DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType); +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h new file mode 100644 index 0000000000..d5bb054b14 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h @@ -0,0 +1,277 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Quad Mesh */ + struct QuadMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH; + + /*! triangle indices */ + struct Quad + { + uint32_t v[4]; + + /*! outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) { + return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }"; + } + }; + + public: + + /*! quad mesh construction */ + QuadMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th quad */ + __forceinline const Quad& quad(size_t i) const { + return quads[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th quad */ + __forceinline BBox3fa bounds(size_t i) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! calculates the bounds of the i'th quad at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0],itime); + const Vec3fa v1 = vertex(q.v[1],itime); + const Vec3fa v2 = vertex(q.v[2],itime); + const Vec3fa v3 = vertex(q.v[3],itime); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(q.v[0],itime))) return false; + if (!isvalid(vertex(q.v[1],itime))) return false; + if (!isvalid(vertex(q.v[2],itime))) return false; + if (!isvalid(vertex(q.v[3],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th quad at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Quad& q = quad(i); + if (q.v[0] >= numVertices()) return false; + if (q.v[1] >= numVertices()) return false; + if (q.v[2] >= numVertices()) return false; + if (q.v[3] >= numVertices()) return false; + + for (unsigned int t=0; t<numTimeSteps; t++) + { + const Vec3fa v0 = vertex(q.v[0],t); + const Vec3fa v1 = vertex(q.v[1],t); + const Vec3fa v2 = vertex(q.v[2],t); + const Vec3fa v3 = vertex(q.v[3],t); + + if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3))) + return false; + } + + if (bbox) + *bbox = bounds(i); + + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false; + const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return quads.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return quads.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return areaProjectedTriangle(v0,v1,v3) + + areaProjectedTriangle(v1,v2,v3); + } + + public: + BufferView<Quad> quads; //!< array of quads + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers + }; + + namespace isa + { + struct QuadMeshISA : public QuadMesh + { + QuadMeshISA (Device* device) + : QuadMesh(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h new file mode 100644 index 0000000000..d0246009db --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h @@ -0,0 +1,326 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" +#include "../subdiv/half_edge.h" +#include "../subdiv/tessellation_cache.h" +#include "../subdiv/catmullclark_coefficients.h" +#include "../subdiv/patch.h" +#include "../../common/algorithms/parallel_map.h" +#include "../../common/algorithms/parallel_set.h" + +namespace embree +{ + class SubdivMesh : public Geometry + { + ALIGNED_CLASS_(16); + public: + + typedef HalfEdge::Edge Edge; + + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH; + + /*! structure used to sort half edges using radix sort by their key */ + struct KeyHalfEdge + { + KeyHalfEdge() {} + + KeyHalfEdge (uint64_t key, HalfEdge* edge) + : key(key), edge(edge) {} + + __forceinline operator uint64_t() const { + return key; + } + + friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) { + return e0.key < e1.key; + } + + public: + uint64_t key; + HalfEdge* edge; + }; + + public: + + /*! subdiv mesh construction */ + SubdivMesh(Device* device); + + public: + void setMask (unsigned mask); + void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode); + void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setTopologyCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void setTessellationRate(float N); + bool verify(); + void commit(); + void addElementsToCount (GeometryCounts & counts) const; + void setDisplacementFunction (RTCDisplacementFunctionN func); + unsigned int getFirstHalfEdge(unsigned int faceID); + unsigned int getFace(unsigned int edgeID); + unsigned int getNextHalfEdge(unsigned int edgeID); + unsigned int getPreviousHalfEdge(unsigned int edgeID); + unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID); + + public: + + /*! return the number of faces */ + size_t numFaces() const { + return faceVertices.size(); + } + + /*! return the number of edges */ + size_t numEdges() const { + return topology[0].vertexIndices.size(); + } + + /*! return the number of vertices */ + size_t numVertices() const { + return vertices[0].size(); + } + + /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t j = 0) const { + return topology[0].getHalfEdge(i)->bounds(vertices[j]); + } + + /*! check if the i'th primitive is valid */ + __forceinline bool valid(size_t i) const { + return topology[0].valid(i) && !invalidFace(i); + } + + /*! check if the i'th primitive is valid for the j'th time range */ + __forceinline bool valid(size_t i, size_t j) const { + return topology[0].valid(i) && !invalidFace(i,j); + } + + /*! prints some statistics */ + void printStatistics(); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + public: + + /*! returns the vertex buffer for some time step */ + __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const { + return vertices[t]; + } + + /* returns tessellation level of edge */ + __forceinline float getEdgeLevel(const size_t i) const + { + if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level? + else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level? + } + + public: + RTCDisplacementFunctionN displFunc; //!< displacement function + + /*! all buffers in this section are provided by the application */ + public: + + /*! the topology contains all data that may differ when + * interpolating different user data buffers */ + struct Topology + { + public: + + /*! Default topology construction */ + Topology () : halfEdges(nullptr,0) {} + + /*! Topology initialization */ + Topology (SubdivMesh* mesh); + + /*! make the class movable */ + public: + Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + : mesh(std::move(other.mesh)), + vertexIndices(std::move(other.vertexIndices)), + subdiv_mode(std::move(other.subdiv_mode)), + halfEdges(std::move(other.halfEdges)), + halfEdges0(std::move(other.halfEdges0)), + halfEdges1(std::move(other.halfEdges1)) {} + + Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + { + mesh = std::move(other.mesh); + vertexIndices = std::move(other.vertexIndices); + subdiv_mode = std::move(other.subdiv_mode); + halfEdges = std::move(other.halfEdges); + halfEdges0 = std::move(other.halfEdges0); + halfEdges1 = std::move(other.halfEdges1); + return *this; + } + + public: + /*! check if the i'th primitive is valid in this topology */ + __forceinline bool valid(size_t i) const + { + if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) { + if (getHalfEdge(i)->faceHasBorder()) return false; + } + return true; + } + + /*! updates the interpolation mode for the topology */ + void setSubdivisionMode (RTCSubdivisionMode mode); + + /*! marks all buffers as modified */ + void update (); + + /*! verifies index array */ + bool verify (size_t numVertices); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + private: + + /*! recalculates the half edges */ + void calculateHalfEdges(); + + /*! updates half edges when recalculation is not necessary */ + void updateHalfEdges(); + + /*! user input data */ + public: + + SubdivMesh* mesh; + + /*! indices of the vertices composing each face */ + BufferView<unsigned int> vertexIndices; + + /*! subdiv interpolation mode */ + RTCSubdivisionMode subdiv_mode; + + /*! generated data */ + public: + + /*! returns the start half edge for face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { + return &halfEdges[mesh->faceStartEdge[f]]; + } + + /*! Half edge structure, generated by initHalfEdgeStructures */ + mvector<HalfEdge> halfEdges; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! two arrays used to sort the half edges */ + std::vector<KeyHalfEdge> halfEdges0; + std::vector<KeyHalfEdge> halfEdges1; + }; + + /*! returns the start half edge for topology t and face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { + return topology[t].getHalfEdge(f); + } + + /*! buffer containing the number of vertices for each face */ + BufferView<unsigned int> faceVertices; + + /*! array of topologies */ + vector<Topology> topology; + + /*! vertex buffer (one buffer for each time step) */ + vector<BufferView<Vec3fa>> vertices; + + /*! user data buffers */ + vector<RawBufferView> vertexAttribs; + + /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */ + BufferView<Edge> edge_creases; + + /*! edge crease weights for each edge of the edge_creases buffer */ + BufferView<float> edge_crease_weights; + + /*! vertex crease buffer containing all vertices that carry vertex crease weights */ + BufferView<unsigned int> vertex_creases; + + /*! vertex crease weights for each vertex of the vertex_creases buffer */ + BufferView<float> vertex_crease_weights; + + /*! subdivision level for each half edge of the vertexIndices buffer */ + BufferView<float> levels; + float tessellationRate; // constant rate that is used when levels is not set + + /*! buffer that marks specific faces as holes */ + BufferView<unsigned> holes; + + /*! all data in this section is generated by initializeHalfEdgeStructures function */ + private: + + /*! number of half edges used by faces */ + size_t numHalfEdges; + + /*! fast lookup table to find the first half edge for some face */ + mvector<uint32_t> faceStartEdge; + + /*! fast lookup table to find the face for some half edge */ + mvector<uint32_t> halfEdgeFace; + + /*! set with all holes */ + parallel_set<uint32_t> holeSet; + + /*! fast lookup table to detect invalid faces */ + mvector<int8_t> invalid_face; + + /*! test if face i is invalid in timestep j */ + __forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } + __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } + + /*! interpolation cache */ + public: + static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; } + static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; } + static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) { + const size_t slots = numInterpolationSlots4(stride); + assert(slot < slots); + return slots*prim+slot; + } + std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags; + std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags; + std::vector<Patch3fa::Ref> patch_eval_trees; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! map with all vertex creases */ + parallel_map<uint32_t,float> vertexCreaseMap; + + /*! map with all edge creases */ + parallel_map<uint64_t,float> edgeCreaseMap; + + protected: + + /*! counts number of geometry commits */ + size_t commitCounter; + }; + + namespace isa + { + struct SubdivMeshISA : public SubdivMesh + { + SubdivMeshISA (Device* device) + : SubdivMesh(device) {} + + void interpolate(const RTCInterpolateArguments* const args); + void interpolateN(const RTCInterpolateNArguments* const args); + }; + } + + DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*); +}; diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp new file mode 100644 index 0000000000..d1c2750f14 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp @@ -0,0 +1,243 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene_triangle_mesh.h" +#include "scene.h" + +namespace embree +{ +#if defined(EMBREE_LOWEST_ISA) + + TriangleMesh::TriangleMesh (Device* device) + : Geometry(device,GTY_TRIANGLE_MESH,0,1) + { + vertices.resize(numTimeSteps); + } + + void TriangleMesh::setMask (unsigned mask) + { + this->mask = mask; + Geometry::update(); + } + + void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps) + { + vertices.resize(numTimeSteps); + Geometry::setNumTimeSteps(numTimeSteps); + } + + void TriangleMesh::setVertexAttributeCount (unsigned int N) + { + vertexAttribs.resize(N); + Geometry::update(); + } + + void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) + { + /* verify that all accesses are 4 bytes aligned */ + if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned"); + + if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (format != RTC_FORMAT_FLOAT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format"); + + /* if buffer is larger than 16GB the premultiplied index optimization does not work */ + if (stride*num > 16ll*1024ll*1024ll*1024ll) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large"); + + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot"); + + vertices[slot].set(buffer, offset, stride, num, format); + vertices[slot].checkPadding16(); + vertices0 = vertices[0]; + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format"); + + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot"); + + vertexAttribs[slot].set(buffer, offset, stride, num, format); + vertexAttribs[slot].checkPadding16(); + } + else if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + if (format != RTC_FORMAT_UINT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format"); + + triangles.set(buffer, offset, stride, num, format); + setNumPrimitives(num); + } + else + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return triangles.getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertices[slot].getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertexAttribs[slot].getPtr(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + return nullptr; + } + } + + void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + triangles.setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertices[slot].setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertexAttribs[slot].setModified(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + Geometry::update(); + } + + void TriangleMesh::commit() + { + /* verify that stride of all time steps are identical */ + for (unsigned int t=0; t<numTimeSteps; t++) + if (vertices[t].getStride() != vertices[0].getStride()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step"); + + Geometry::commit(); + } + + void TriangleMesh::addElementsToCount (GeometryCounts & counts) const + { + if (numTimeSteps == 1) counts.numTriangles += numPrimitives; + else counts.numMBTriangles += numPrimitives; + } + + bool TriangleMesh::verify() + { + /*! verify size of vertex arrays */ + if (vertices.size() == 0) return false; + for (const auto& buffer : vertices) + if (buffer.size() != numVertices()) + return false; + + /*! verify size of user vertex arrays */ + for (const auto& buffer : vertexAttribs) + if (buffer.size() != numVertices()) + return false; + + /*! verify triangle indices */ + for (size_t i=0; i<size(); i++) { + if (triangles[i].v[0] >= numVertices()) return false; + if (triangles[i].v[1] >= numVertices()) return false; + if (triangles[i].v[2] >= numVertices()) return false; + } + + /*! verify vertices */ + for (const auto& buffer : vertices) + for (size_t i=0; i<buffer.size(); i++) + if (!isvalid(buffer[i])) + return false; + + return true; + } + + void TriangleMesh::interpolate(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + float v = args->v; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=4) + { + size_t ofs = i*sizeof(float); + const float w = 1.0f-u-v; + const Triangle& tri = triangle(primID); + const vbool4 valid = vint4((int)i)+vint4(step) < vint4(int(valueCount)); + const vfloat4 p0 = vfloat4::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]); + const vfloat4 p1 = vfloat4::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]); + const vfloat4 p2 = vfloat4::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]); + + if (P) { + vfloat4::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2))); + } + if (dPdu) { + assert(dPdu); vfloat4::storeu(valid,dPdu+i,p1-p0); + assert(dPdv); vfloat4::storeu(valid,dPdv+i,p2-p0); + } + if (ddPdudu) { + assert(ddPdudu); vfloat4::storeu(valid,ddPdudu+i,vfloat4(zero)); + assert(ddPdvdv); vfloat4::storeu(valid,ddPdvdv+i,vfloat4(zero)); + assert(ddPdudv); vfloat4::storeu(valid,ddPdudv+i,vfloat4(zero)); + } + } + } + +#endif + + namespace isa + { + TriangleMesh* createTriangleMesh(Device* device) { + return new TriangleMeshISA(device); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h new file mode 100644 index 0000000000..eaf2e1799a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h @@ -0,0 +1,264 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Triangle Mesh */ + struct TriangleMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH; + + /*! triangle indices */ + struct Triangle + { + uint32_t v[3]; + + /*! outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) { + return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }"; + } + }; + + public: + + /*! triangle mesh construction */ + TriangleMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th triangle*/ + __forceinline const Triangle& triangle(size_t i) const { + return triangles[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th triangle */ + __forceinline BBox3fa bounds(size_t i) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! calculates the bounds of the i'th triangle at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0],itime); + const Vec3fa v1 = vertex(tri.v[1],itime); + const Vec3fa v2 = vertex(tri.v[2],itime); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(tri.v[0],itime))) return false; + if (!isvalid(vertex(tri.v[1],itime))) return false; + if (!isvalid(vertex(tri.v[2],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t t=0; t<numTimeSteps; t++) + { + const Vec3fa v0 = vertex(tri.v[0],t); + const Vec3fa v1 = vertex(tri.v[1],t); + const Vec3fa v2 = vertex(tri.v[2],t); + if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2))) + return false; + } + + if (likely(bbox)) + *bbox = bounds(i); + + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return triangles.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return triangles.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return areaProjectedTriangle(v0,v1,v2); + } + + public: + BufferView<Triangle> triangles; //!< array of triangles + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<RawBufferView> vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct TriangleMeshISA : public TriangleMesh + { + TriangleMeshISA (Device* device) + : TriangleMesh(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h new file mode 100644 index 0000000000..8d11ed6986 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h @@ -0,0 +1,77 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accelset.h" + +namespace embree +{ + /*! User geometry with user defined intersection functions */ + struct UserGeometry : public AccelSet + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY; + + public: + UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); + virtual void setMask (unsigned mask); + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect); + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded); + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const; + }; + + namespace isa + { + struct UserGeometryISA : public UserGeometry + { + UserGeometryISA (Device* device) + : UserGeometry(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h new file mode 100644 index 0000000000..533c385365 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h @@ -0,0 +1,125 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! An item on the stack holds the node ID and distance of that node. */ + template<typename T> + struct __aligned(16) StackItemT + { + /*! assert that the xchg function works */ + static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed"); + + __forceinline StackItemT() {} + + __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} + + /*! use SSE instructions to swap stack items */ + __forceinline static void xchg(StackItemT& a, StackItemT& b) + { + const vfloat4 sse_a = vfloat4::load((float*)&a); + const vfloat4 sse_b = vfloat4::load((float*)&b); + vfloat4::store(&a,sse_b); + vfloat4::store(&b,sse_a); + } + + /*! Sort 2 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2) { + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 3 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s3.dist < s2.dist) xchg(s3,s2); + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s4.dist < s3.dist) xchg(s4,s3); + if (s3.dist < s1.dist) xchg(s3,s1); + if (s4.dist < s2.dist) xchg(s4,s2); + if (s3.dist < s2.dist) xchg(s3,s2); + } + + /*! use SSE instructions to swap stack items */ + __forceinline static void cmp_xchg(vint4& a, vint4& b) + { +#if defined(__AVX512VL__) + const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); +#else + const vboolf4 mask0(b < a); + const vboolf4 mask(shuffle<2,2,2,2>(mask0)); +#endif + const vint4 c = select(mask,b,a); + const vint4 d = select(mask,a,b); + a = c; + b = d; + } + + /*! Sort 3 stack items. */ + __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3) + { + cmp_xchg(s2,s1); + cmp_xchg(s3,s2); + cmp_xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4) + { + cmp_xchg(s2,s1); + cmp_xchg(s4,s3); + cmp_xchg(s3,s1); + cmp_xchg(s4,s2); + cmp_xchg(s3,s2); + } + + + /*! Sort N stack items. */ + __forceinline friend void sort(StackItemT* begin, StackItemT* end) + { + for (StackItemT* i = begin+1; i != end; ++i) + { + const vfloat4 item = vfloat4::load((float*)i); + const unsigned dist = i->dist; + StackItemT* j = i; + + while ((j != begin) && ((j-1)->dist < dist)) + { + vfloat4::store(j, vfloat4::load((float*)(j-1))); + --j; + } + + vfloat4::store(j, item); + } + } + + public: + T ptr; + unsigned dist; + }; + + /*! An item on the stack holds the node ID and active ray mask. */ + template<typename T> + struct __aligned(8) StackItemMaskT + { + T ptr; + size_t mask; + }; + + struct __aligned(8) StackItemMaskCoherent + { + size_t mask; + size_t parent; + size_t child; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp new file mode 100644 index 0000000000..b73c3a8c76 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/stat.cpp @@ -0,0 +1,128 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stat.h" + +namespace embree +{ + Stat Stat::instance; + + Stat::Stat () { + } + + Stat::~Stat () + { +#ifdef EMBREE_STAT_COUNTERS + Stat::print(std::cout); +#endif + } + + void Stat::print(std::ostream& cout) + { + Counters& cntrs = instance.cntrs; + Counters::Data& data = instance.cntrs.code; + //Counters::Data& data = instance.cntrs.active; + + /* print absolute numbers */ + cout << "--------- ABSOLUTE ---------" << std::endl; + cout << " #normal_travs = " << float(data.normal.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.normal.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.normal.trav_xfm_nodes )*1E-6 << "M" << std::endl; + cout << " #leaves = " << float(data.normal.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.normal.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.normal.trav_prim_hits )*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t normal_box_hits = 0; + size_t weighted_box_hits = 0; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) { + normal_box_hits += data.normal.trav_hit_boxes[i]; + weighted_box_hits += data.normal.trav_hit_boxes[i]*i; + } + cout << " #hit_boxes = " << normal_box_hits << " (total) distribution: "; + float average = 0.0f; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) + { + float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits; + cout << "[" << i << "] " << value << " "; + average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits; + } + cout << " average = " << average << std::endl; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " "; + cout << std::endl; + + if (data.shadow.travs) { + cout << " #shadow_travs = " << float(data.shadow.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.shadow.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl; + cout << " #leaves = " << float(data.shadow.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.shadow.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t shadow_box_hits = 0; + size_t weighted_shadow_box_hits = 0; + + for (size_t i=0;i<SIZE_HISTOGRAM;i++) { + shadow_box_hits += data.shadow.trav_hit_boxes[i]; + weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i; + } + cout << " #hit_boxes = "; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " "; + cout << std::endl; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " "; + cout << std::endl; + } + cout << std::endl; + + /* print per traversal numbers */ + cout << "--------- PER TRAVERSAL ---------" << std::endl; + float active_normal_travs = float(cntrs.active.normal.travs )/float(cntrs.all.normal.travs ); + float active_normal_trav_nodes = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes ); + float active_normal_trav_xfm_nodes = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes ); + float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves); + float active_normal_trav_prims = float(cntrs.active.normal.trav_prims )/float(cntrs.all.normal.trav_prims ); + float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits )/float(cntrs.all.normal.trav_prim_hits ); + float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop )/float(cntrs.all.normal.trav_stack_pop ); + + cout << " #normal_travs = " << float(cntrs.code.normal.travs )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs << "% active" << std::endl; + cout << " #nodes = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes << "% active" << std::endl; + cout << " #node_xfm = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes << "% active" << std::endl; + cout << " #leaves = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl; + cout << " #prims = " << float(cntrs.code.normal.trav_prims )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims << "% active" << std::endl; + cout << " #prim_hits = " << float(cntrs.code.normal.trav_prim_hits )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits << "% active" << std::endl; + cout << " #stack_pop = " << float(cntrs.code.normal.trav_stack_pop )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop << "% active" << std::endl; + + if (cntrs.all.shadow.travs) { + float active_shadow_travs = float(cntrs.active.shadow.travs )/float(cntrs.all.shadow.travs ); + float active_shadow_trav_nodes = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes ); + float active_shadow_trav_xfm_nodes = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes ); + float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves); + float active_shadow_trav_prims = float(cntrs.active.shadow.trav_prims )/float(cntrs.all.shadow.trav_prims ); + float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits )/float(cntrs.all.shadow.trav_prim_hits ); + + cout << " #shadow_travs = " << float(cntrs.code.shadow.travs )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs << "% active" << std::endl; + cout << " #nodes = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes << "% active" << std::endl; + cout << " #nodes_xfm = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes << "% active" << std::endl; + cout << " #leaves = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl; + cout << " #prims = " << float(cntrs.code.shadow.trav_prims )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims << "% active" << std::endl; + cout << " #prim_hits = " << float(cntrs.code.shadow.trav_prim_hits )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits << "% active" << std::endl; + + } + cout << std::endl; + + /* print user counters for performance tuning */ + cout << "--------- USER ---------" << std::endl; + for (size_t i=0; i<10; i++) + cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl; + + cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl; + cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl; + cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl; + cout << std::endl; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/stat.h b/thirdparty/embree-aarch64/kernels/common/stat.h new file mode 100644 index 0000000000..3cda2bd014 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/stat.h @@ -0,0 +1,116 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +/* Macros to gather statistics */ +#ifdef EMBREE_STAT_COUNTERS +# define STAT(x) x +# define STAT3(s,x,y,z) \ + STAT(Stat::get().code .s+=x); \ + STAT(Stat::get().active.s+=y); \ + STAT(Stat::get().all .s+=z); +# define STAT_USER(i,x) Stat::get().user[i]+=x; +#else +# define STAT(x) +# define STAT3(s,x,y,z) +# define STAT_USER(i,x) +#endif + +namespace embree +{ + /*! Gathers ray tracing statistics. We count 1) how often a code + * location is reached, 2) how many SIMD lanes are active, 3) how + * many SIMD lanes reach the code location */ + class Stat + { + public: + + static const size_t SIZE_HISTOGRAM = 64+1; + + /*! constructs stat counter class */ + Stat (); + + /*! destructs stat counter class */ + ~Stat (); + + class Counters + { + public: + Counters () { + clear(); + } + + void clear() + { + all.clear(); + active.clear(); + code.clear(); + for (auto& u : user) u.store(0); + } + + public: + + /* per packet and per ray stastics */ + struct Data + { + void clear () { + normal.clear(); + shadow.clear(); + point_query.clear(); + } + + /* normal and shadow ray statistics */ + struct + { + void clear() + { + travs.store(0); + trav_nodes.store(0); + trav_leaves.store(0); + trav_prims.store(0); + trav_prim_hits.store(0); + for (auto& v : trav_hit_boxes) v.store(0); + trav_stack_pop.store(0); + trav_stack_nodes.store(0); + trav_xfm_nodes.store(0); + } + + public: + std::atomic<size_t> travs; + std::atomic<size_t> trav_nodes; + std::atomic<size_t> trav_leaves; + std::atomic<size_t> trav_prims; + std::atomic<size_t> trav_prim_hits; + std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1]; + std::atomic<size_t> trav_stack_pop; + std::atomic<size_t> trav_stack_nodes; + std::atomic<size_t> trav_xfm_nodes; + + } normal, shadow, point_query; + } all, active, code; + + std::atomic<size_t> user[10]; + }; + + public: + + static __forceinline Counters& get() { + return instance.cntrs; + } + + static void clear() { + instance.cntrs.clear(); + } + + static void print(embree_ostream cout); + + private: + Counters cntrs; + + private: + static Stat instance; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp new file mode 100644 index 0000000000..51fc9b7826 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/state.cpp @@ -0,0 +1,543 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "state.h" +#include "../../common/lexers/streamfilters.h" + +namespace embree +{ + MutexSys g_printMutex; + + State::ErrorHandler State::g_errorHandler; + + State::ErrorHandler::ErrorHandler() + : thread_error(createTls()) {} + + State::ErrorHandler::~ErrorHandler() + { + Lock<MutexSys> lock(errors_mutex); + for (size_t i=0; i<thread_errors.size(); i++) + delete thread_errors[i]; + destroyTls(thread_error); + thread_errors.clear(); + } + + RTCError* State::ErrorHandler::error() + { + RTCError* stored_error = (RTCError*) getTls(thread_error); + if (stored_error) return stored_error; + + Lock<MutexSys> lock(errors_mutex); + stored_error = new RTCError(RTC_ERROR_NONE); + thread_errors.push_back(stored_error); + setTls(thread_error,stored_error); + return stored_error; + } + + State::State () + : enabled_cpu_features(getCPUFeatures()), + enabled_builder_cpu_features(enabled_cpu_features), + frequency_level(FREQUENCY_SIMD256) + { + tri_accel = "default"; + tri_builder = "default"; + tri_traverser = "default"; + + tri_accel_mb = "default"; + tri_builder_mb = "default"; + tri_traverser_mb = "default"; + + quad_accel = "default"; + quad_builder = "default"; + quad_traverser = "default"; + + quad_accel_mb = "default"; + quad_builder_mb = "default"; + quad_traverser_mb = "default"; + + line_accel = "default"; + line_builder = "default"; + line_traverser = "default"; + + line_accel_mb = "default"; + line_builder_mb = "default"; + line_traverser_mb = "default"; + + hair_accel = "default"; + hair_builder = "default"; + hair_traverser = "default"; + + hair_accel_mb = "default"; + hair_builder_mb = "default"; + hair_traverser_mb = "default"; + + object_accel = "default"; + object_builder = "default"; + object_accel_min_leaf_size = 1; + object_accel_max_leaf_size = 1; + + object_accel_mb = "default"; + object_builder_mb = "default"; + object_accel_mb_min_leaf_size = 1; + object_accel_mb_max_leaf_size = 1; + + max_spatial_split_replications = 1.2f; + useSpatialPreSplits = false; + + tessellation_cache_size = 128*1024*1024; + + subdiv_accel = "default"; + subdiv_accel_mb = "default"; + + grid_accel = "default"; + grid_builder = "default"; + grid_accel_mb = "default"; + grid_builder_mb = "default"; + + instancing_open_min = 0; + instancing_block_size = 0; + instancing_open_factor = 8.0f; + instancing_open_max_depth = 32; + instancing_open_max = 50000000; + + ignore_config_files = false; + float_exceptions = false; + quality_flags = -1; + scene_flags = -1; + verbose = 0; + benchmark = 0; + + numThreads = 0; + numUserThreads = 0; + +#if TASKING_INTERNAL + set_affinity = true; +#else + set_affinity = false; +#endif + /* per default enable affinity on KNL */ + if (hasISA(AVX512KNL)) set_affinity = true; + + start_threads = false; + enable_selockmemoryprivilege = false; +#if defined(__LINUX__) + hugepages = true; +#else + hugepages = false; +#endif + hugepages_success = true; + + alloc_main_block_size = 0; + alloc_num_main_slots = 0; + alloc_thread_block_size = 0; + alloc_single_thread_alloc = -1; + + error_function = nullptr; + error_function_userptr = nullptr; + + memory_monitor_function = nullptr; + memory_monitor_userptr = nullptr; + } + + State::~State() { + } + + bool State::hasISA(const int isa) { + return (enabled_cpu_features & isa) == isa; + } + + bool State::checkISASupport() { +#if defined(__ARM_NEON) + /* + * NEON CPU type is a mixture of NEON and SSE2 + */ + + bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; + + /* this will be true when explicitly initialize Device with `isa=neon` config */ + bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; + + return hasSSE2 || hasNEON; +#else + return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; +#endif + } + + void State::verify() + { + /* verify that calculations stay in range */ + assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX); + + /* here we verify that CPP files compiled for a specific ISA only + * call that same or lower ISA version of non-inlined class member + * functions */ +#if defined(DEBUG) +#if defined(EMBREE_TARGET_SSE2) +#if !defined(__ARM_NEON) + assert(sse2::getISA() <= SSE2); +#endif +#endif +#if defined(EMBREE_TARGET_SSE42) + assert(sse42::getISA() <= SSE42); +#endif +#if defined(EMBREE_TARGET_AVX) + assert(avx::getISA() <= AVX); +#endif +#if defined(EMBREE_TARGET_AVX2) + assert(avx2::getISA() <= AVX2); +#endif +#if defined (EMBREE_TARGET_AVX512KNL) + assert(avx512knl::getISA() <= AVX512KNL); +#endif +#if defined (EMBREE_TARGET_AVX512SKX) + assert(avx512skx::getISA() <= AVX512SKX); +#endif +#endif + } + + const char* symbols[3] = { "=", ",", "|" }; + + bool State::parseFile(const FileName& fileName) + { + FILE* f = fopen(fileName.c_str(),"r"); + if (!f) return false; + Ref<Stream<int> > file = new FileStream(f,fileName); + + std::vector<std::string> syms; + for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) + syms.push_back(symbols[i]); + + Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(file,"#"), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + return true; + } + + void State::parseString(const char* cfg) + { + if (cfg == nullptr) return; + + std::vector<std::string> syms; + for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) + syms.push_back(symbols[i]); + + Ref<TokenStream> cin = new TokenStream(new StrStream(cfg), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + } + + int string_to_cpufeatures(const std::string& isa) + { + if (isa == "sse" ) return SSE; + else if (isa == "sse2") return SSE2; + else if (isa == "sse3") return SSE3; + else if (isa == "ssse3") return SSSE3; + else if (isa == "sse41") return SSE41; + else if (isa == "sse4.1") return SSE41; + else if (isa == "sse42") return SSE42; + else if (isa == "sse4.2") return SSE42; + else if (isa == "avx") return AVX; + else if (isa == "avxi") return AVXI; + else if (isa == "avx2") return AVX2; + else if (isa == "avx512knl") return AVX512KNL; + else if (isa == "avx512skx") return AVX512SKX; + else return SSE2; + } + + void State::parse(Ref<TokenStream> cin) + { + /* parse until end of stream */ + while (cin->peek() != Token::Eof()) + { + const Token tok = cin->get(); + + if (tok == Token::Id("threads") && cin->trySymbol("=")) + numThreads = cin->get().Int(); + + else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) + numUserThreads = cin->get().Int(); + + else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) + start_threads = cin->get().Int(); + + else if (tok == Token::Id("isa") && cin->trySymbol("=")) { + std::string isa = toLowerCase(cin->get().Identifier()); + enabled_cpu_features = string_to_cpufeatures(isa); + enabled_builder_cpu_features = enabled_cpu_features; + } + + else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { + std::string isa = toLowerCase(cin->get().Identifier()); + enabled_cpu_features &= string_to_cpufeatures(isa); + enabled_builder_cpu_features &= enabled_cpu_features; + } + + else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { + std::string isa = toLowerCase(cin->get().Identifier()); + enabled_builder_cpu_features &= string_to_cpufeatures(isa); + } + + else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) { + std::string freq = cin->get().Identifier(); + if (freq == "simd128") frequency_level = FREQUENCY_SIMD128; + else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256; + else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512; + } + + else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) { + enable_selockmemoryprivilege = cin->get().Int(); + } + else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) { + hugepages = cin->get().Int(); + } + + else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("=")) + ignore_config_files = cin->get().Int(); + else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) + float_exceptions = cin->get().Int(); + + else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) + tri_accel = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("=")) + tri_builder = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("=")) + tri_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("=")) + tri_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("=")) + tri_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("=")) + tri_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("=")) + quad_accel = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("=")) + quad_builder = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("=")) + quad_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("=")) + quad_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("=")) + quad_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("=")) + quad_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel")) && cin->trySymbol("=")) + line_accel = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder")) && cin->trySymbol("=")) + line_builder = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("=")) + line_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("=")) + line_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("=")) + line_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("=")) + line_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel") && cin->trySymbol("=")) + hair_accel = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder") && cin->trySymbol("=")) + hair_builder = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser") && cin->trySymbol("=")) + hair_traverser = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("=")) + hair_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("=")) + hair_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("=")) + hair_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("object_accel") && cin->trySymbol("=")) + object_accel = cin->get().Identifier(); + else if (tok == Token::Id("object_builder") && cin->trySymbol("=")) + object_builder = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("=")) + object_accel_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("=")) + object_accel_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("=")) + object_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("=")) + object_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("=")) + object_accel_mb_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("=")) + object_accel_mb_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("=")) + instancing_open_min = cin->get().Int(); + else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { + instancing_block_size = cin->get().Int(); + instancing_open_factor = 0.0f; + } + else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("=")) + instancing_open_max_depth = cin->get().Int(); + else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { + instancing_block_size = 0; + instancing_open_factor = cin->get().Float(); + } + else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("=")) + instancing_open_max = cin->get().Int(); + + else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("=")) + subdiv_accel = cin->get().Identifier(); + else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("=")) + subdiv_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("grid_accel") && cin->trySymbol("=")) + grid_accel = cin->get().Identifier(); + else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("=")) + grid_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("verbose") && cin->trySymbol("=")) + verbose = cin->get().Int(); + else if (tok == Token::Id("benchmark") && cin->trySymbol("=")) + benchmark = cin->get().Int(); + + else if (tok == Token::Id("quality")) { + if (cin->trySymbol("=")) { + Token flag = cin->get(); + if (flag == Token::Id("low")) quality_flags = RTC_BUILD_QUALITY_LOW; + else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM; + else if (flag == Token::Id("high")) quality_flags = RTC_BUILD_QUALITY_HIGH; + } + } + + else if (tok == Token::Id("scene_flags")) { + scene_flags = 0; + if (cin->trySymbol("=")) { + do { + Token flag = cin->get(); + if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC; + else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT; + else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST; + } while (cin->trySymbol("|")); + } + } + + else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("=")) + max_spatial_split_replications = cin->get().Float(); + + else if (tok == Token::Id("presplits") && cin->trySymbol("=")) + useSpatialPreSplits = cin->get().Int() != 0 ? true : false; + + else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + + else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("=")) + alloc_main_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("=")) + alloc_num_main_slots = cin->get().Int(); + else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("=")) + alloc_thread_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("=")) + alloc_single_thread_alloc = cin->get().Int(); + + cin->trySymbol(","); // optional , separator + } + } + + bool State::verbosity(size_t N) { + return N <= verbose; + } + + void State::print() + { + std::cout << "general:" << std::endl; + std::cout << " build threads = " << numThreads << std::endl; + std::cout << " build user threads = " << numUserThreads << std::endl; + std::cout << " start_threads = " << start_threads << std::endl; + std::cout << " affinity = " << set_affinity << std::endl; + std::cout << " frequency_level = "; + switch (frequency_level) { + case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break; + case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break; + case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break; + default: std::cout << "error" << std::endl; break; + } + + std::cout << " hugepages = "; + if (!hugepages) std::cout << "disabled" << std::endl; + else if (hugepages_success) std::cout << "enabled" << std::endl; + else std::cout << "failed" << std::endl; + + std::cout << " verbosity = " << verbose << std::endl; + std::cout << " cache_size = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl; + std::cout << " max_spatial_split_replications = " << max_spatial_split_replications << std::endl; + + std::cout << "triangles:" << std::endl; + std::cout << " accel = " << tri_accel << std::endl; + std::cout << " builder = " << tri_builder << std::endl; + std::cout << " traverser = " << tri_traverser << std::endl; + + std::cout << "motion blur triangles:" << std::endl; + std::cout << " accel = " << tri_accel_mb << std::endl; + std::cout << " builder = " << tri_builder_mb << std::endl; + std::cout << " traverser = " << tri_traverser_mb << std::endl; + + std::cout << "quads:" << std::endl; + std::cout << " accel = " << quad_accel << std::endl; + std::cout << " builder = " << quad_builder << std::endl; + std::cout << " traverser = " << quad_traverser << std::endl; + + std::cout << "motion blur quads:" << std::endl; + std::cout << " accel = " << quad_accel_mb << std::endl; + std::cout << " builder = " << quad_builder_mb << std::endl; + std::cout << " traverser = " << quad_traverser_mb << std::endl; + + std::cout << "line segments:" << std::endl; + std::cout << " accel = " << line_accel << std::endl; + std::cout << " builder = " << line_builder << std::endl; + std::cout << " traverser = " << line_traverser << std::endl; + + std::cout << "motion blur line segments:" << std::endl; + std::cout << " accel = " << line_accel_mb << std::endl; + std::cout << " builder = " << line_builder_mb << std::endl; + std::cout << " traverser = " << line_traverser_mb << std::endl; + + std::cout << "hair:" << std::endl; + std::cout << " accel = " << hair_accel << std::endl; + std::cout << " builder = " << hair_builder << std::endl; + std::cout << " traverser = " << hair_traverser << std::endl; + + std::cout << "motion blur hair:" << std::endl; + std::cout << " accel = " << hair_accel_mb << std::endl; + std::cout << " builder = " << hair_builder_mb << std::endl; + std::cout << " traverser = " << hair_traverser_mb << std::endl; + + std::cout << "subdivision surfaces:" << std::endl; + std::cout << " accel = " << subdiv_accel << std::endl; + + std::cout << "grids:" << std::endl; + std::cout << " accel = " << grid_accel << std::endl; + std::cout << " builder = " << grid_builder << std::endl; + + std::cout << "motion blur grids:" << std::endl; + std::cout << " accel = " << grid_accel_mb << std::endl; + std::cout << " builder = " << grid_builder_mb << std::endl; + + std::cout << "object_accel:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_max_leaf_size << std::endl; + + std::cout << "object_accel_mb:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_mb_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_mb_max_leaf_size << std::endl; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h new file mode 100644 index 0000000000..d0fccc023f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/state.h @@ -0,0 +1,197 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* mutex to make printing to cout thread safe */ + extern MutexSys g_printMutex; + + struct State : public RefCount + { + public: + /*! state construction */ + State (); + + /*! state destruction */ + ~State(); + + /*! verifies that state is correct */ + void verify(); + + /*! parses state from a configuration file */ + bool parseFile(const FileName& fileName); + + /*! parses the state from a string */ + void parseString(const char* cfg); + + /*! parses the state from a stream */ + void parse(Ref<TokenStream> cin); + + /*! prints the state */ + void print(); + + /*! checks if verbosity level is at least N */ + bool verbosity(size_t N); + + /*! checks if some particular ISA is enabled */ + bool hasISA(const int isa); + + /*! check whether selected ISA is supported by the HW */ + bool checkISASupport(); + + public: + std::string tri_accel; //!< acceleration structure to use for triangles + std::string tri_builder; //!< builder to use for triangles + std::string tri_traverser; //!< traverser to use for triangles + + public: + std::string tri_accel_mb; //!< acceleration structure to use for motion blur triangles + std::string tri_builder_mb; //!< builder to use for motion blur triangles + std::string tri_traverser_mb; //!< traverser to use for triangles + + public: + std::string quad_accel; //!< acceleration structure to use for quads + std::string quad_builder; //!< builder to use for quads + std::string quad_traverser; //!< traverser to use for quads + + public: + std::string quad_accel_mb; //!< acceleration structure to use for motion blur quads + std::string quad_builder_mb; //!< builder to use for motion blur quads + std::string quad_traverser_mb; //!< traverser to use for motion blur quads + + public: + std::string line_accel; //!< acceleration structure to use for line segments + std::string line_builder; //!< builder to use for line segments + std::string line_traverser; //!< traverser to use for line segments + + public: + std::string line_accel_mb; //!< acceleration structure to use for motion blur line segments + std::string line_builder_mb; //!< builder to use for motion blur line segments + std::string line_traverser_mb; //!< traverser to use for motion blur line segments + + public: + std::string hair_accel; //!< hair acceleration structure to use + std::string hair_builder; //!< builder to use for hair + std::string hair_traverser; //!< traverser to use for hair + + public: + std::string hair_accel_mb; //!< acceleration structure to use for motion blur hair + std::string hair_builder_mb; //!< builder to use for motion blur hair + std::string hair_traverser_mb; //!< traverser to use for motion blur hair + + public: + std::string object_accel; //!< acceleration structure for user geometries + std::string object_builder; //!< builder for user geometries + int object_accel_min_leaf_size; //!< minimum leaf size for object acceleration structure + int object_accel_max_leaf_size; //!< maximum leaf size for object acceleration structure + + public: + std::string object_accel_mb; //!< acceleration structure for user geometries + std::string object_builder_mb; //!< builder for user geometries + int object_accel_mb_min_leaf_size; //!< minimum leaf size for mblur object acceleration structure + int object_accel_mb_max_leaf_size; //!< maximum leaf size for mblur object acceleration structure + + public: + std::string subdiv_accel; //!< acceleration structure to use for subdivision surfaces + std::string subdiv_accel_mb; //!< acceleration structure to use for subdivision surfaces + + public: + std::string grid_accel; //!< acceleration structure to use for grids + std::string grid_builder; //!< builder for grids + std::string grid_accel_mb; //!< acceleration structure to use for motion blur grids + std::string grid_builder_mb; //!< builder for motion blur grids + + public: + float max_spatial_split_replications; //!< maximally replications*N many primitives in accel for spatial splits + bool useSpatialPreSplits; //!< use spatial pre-splits instead of the full spatial split builder + size_t tessellation_cache_size; //!< size of the shared tessellation cache + + public: + size_t instancing_open_min; //!< instancing opens tree to minimally that number of subtrees + size_t instancing_block_size; //!< instancing opens tree up to average block size of primitives + float instancing_open_factor; //!< instancing opens tree up to x times the number of instances + size_t instancing_open_max_depth; //!< maximum open depth for geometries + size_t instancing_open_max; //!< instancing opens tree to maximally that number of subtrees + + public: + bool ignore_config_files; //!< if true no more config files get parse + bool float_exceptions; //!< enable floating point exceptions + int quality_flags; + int scene_flags; + size_t verbose; //!< verbosity of output + size_t benchmark; //!< true + + public: + size_t numThreads; //!< number of threads to use in builders + size_t numUserThreads; //!< number of user provided threads to use in builders + bool set_affinity; //!< sets affinity for worker threads + bool start_threads; //!< true when threads should be started at device creation time + int enabled_cpu_features; //!< CPU ISA features to use + int enabled_builder_cpu_features; //!< CPU ISA features to use for builders only + enum FREQUENCY_LEVEL { + FREQUENCY_SIMD128, + FREQUENCY_SIMD256, + FREQUENCY_SIMD512 + } frequency_level; //!< frequency level the app wants to run on (default is SIMD256) + bool enable_selockmemoryprivilege; //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages + bool hugepages; //!< true if huge pages should get used + bool hugepages_success; //!< status for enabling huge pages + + public: + size_t alloc_main_block_size; //!< main allocation block size (shared between threads) + int alloc_num_main_slots; //!< number of such shared blocks to be used to allocate + size_t alloc_thread_block_size; //!< size of thread local allocator block size + int alloc_single_thread_alloc; //!< in single mode nodes and leaves use same thread local allocator + + public: + + /*! checks if we can use AVX */ + bool canUseAVX() { + return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128; + } + + /*! checks if we can use AVX2 */ + bool canUseAVX2() { + return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128; + } + + struct ErrorHandler + { + public: + ErrorHandler(); + ~ErrorHandler(); + RTCError* error(); + + public: + tls_t thread_error; + std::vector<RTCError*> thread_errors; + MutexSys errors_mutex; + }; + ErrorHandler errorHandler; + static ErrorHandler g_errorHandler; + + public: + void setErrorFunction(RTCErrorFunction fptr, void* uptr) + { + error_function = fptr; + error_function_userptr = uptr; + } + + RTCErrorFunction error_function; + void* error_function_userptr; + + public: + void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) + { + memory_monitor_function = fptr; + memory_monitor_userptr = uptr; + } + + RTCMemoryMonitorFunction memory_monitor_function; + void* memory_monitor_userptr; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h new file mode 100644 index 0000000000..b478762240 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/vector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "default.h" + +namespace embree +{ + /*! invokes the memory monitor callback */ + struct MemoryMonitorInterface { + virtual void memoryMonitor(ssize_t bytes, bool post) = 0; + }; + + /*! allocator that performs aligned monitored allocations */ + template<typename T, size_t alignment = 64> + struct aligned_monitored_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) + : device(device), hugepages(false) {} + + __forceinline pointer allocate( size_type n ) + { + if (n) { + assert(device); + device->memoryMonitor(n*sizeof(T),false); + } + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + { + pointer p = (pointer) os_malloc(n*sizeof(value_type),hugepages); + assert(p); + return p; + } + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) + { + if (p) + { + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + os_free(p,n*sizeof(value_type),hugepages); + else + alignedFree(p); + } + else assert(n == 0); + + if (n) { + assert(device); + device->memoryMonitor(-ssize_t(n)*sizeof(T),true); + } + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + private: + MemoryMonitorInterface* device; + bool hugepages; + }; + + /*! monitored vector */ + template<typename T> + using mvector = vector_t<T,aligned_monitored_allocator<T,std::alignment_of<T>::value> >; +} |