diff options
Diffstat (limited to 'thirdparty/embree/kernels')
51 files changed, 575 insertions, 330 deletions
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h index 8f21e3254f..cba32ca73c 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_morton.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h @@ -411,7 +411,7 @@ namespace embree ReductionTy bounds[MAX_BRANCHING_FACTOR]; if (current.size() > singleThreadThreshold) { - /*! parallel_for is faster than spawing sub-tasks */ + /*! parallel_for is faster than spawning sub-tasks */ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { for (size_t i=r.begin(); i<r.end(); i++) { bounds[i] = recurse(depth+1,children[i],nullptr,true); diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h index f9a08d65cd..6e73c0d250 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h @@ -374,7 +374,7 @@ namespace embree const size_t begin = set.begin(); const size_t end = set.end(); - const size_t center = (begin + end)/2; + const size_t center = (begin + end + 1) / 2; PrimInfoMB linfo = empty; for (size_t i=begin; i<center; i++) @@ -594,7 +594,7 @@ namespace embree /* spawn tasks */ if (unlikely(current.size() > cfg.singleThreadThreshold)) { - /*! parallel_for is faster than spawing sub-tasks */ + /*! parallel_for is faster than spawning sub-tasks */ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { for (size_t i=r.begin(); i<r.end(); i++) { values[i] = recurse(children[i],nullptr,true); diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h index fff4bf2a35..24c5faf8be 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_sah.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h @@ -298,7 +298,7 @@ namespace embree /* spawn tasks */ if (current.size() > cfg.singleThreadThreshold) { - /*! parallel_for is faster than spawing sub-tasks */ + /*! parallel_for is faster than spawning sub-tasks */ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here for (size_t i=r.begin(); i<r.end(); i++) { values[i] = recurse(children[i],nullptr,true); diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h index ee29d09ac9..41be6183b8 100644 --- a/thirdparty/embree/kernels/builders/heuristic_binning.h +++ b/thirdparty/embree/kernels/builders/heuristic_binning.h @@ -57,14 +57,12 @@ namespace embree __forceinline Vec3ia bin(const Vec3fa& p) const { const vint4 i = floori((vfloat4(p)-ofs)*scale); -#if 1 assert(i[0] >= 0 && (size_t)i[0] < num); assert(i[1] >= 0 && (size_t)i[1] < num); assert(i[2] >= 0 && (size_t)i[2] < num); - return Vec3ia(i); -#else + + // we clamp to handle corner cases that could calculate out of bounds bin return Vec3ia(clamp(i,vint4(0),vint4(num-1))); -#endif } /*! faster but unsafe binning */ diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h index 4249d16ea1..354e283557 100644 --- a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h +++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h @@ -275,7 +275,7 @@ namespace embree openNodesBasedOnExtend(set); #endif - /* disable opening when unsufficient space for opening a node available */ + /* disable opening when insufficient space for opening a node available */ if (set.ext_range_size() < max_open_size-1) set.set_ext_range(set.end()); /* disable opening */ } diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h index a6939ba258..8b3499ac8d 100644 --- a/thirdparty/embree/kernels/builders/heuristic_spatial.h +++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h @@ -159,27 +159,25 @@ namespace embree assert(binID < BINS); bounds [binID][dim].extend(b); } - - /*! bins an array of triangles */ - template<typename SplitPrimitive> - __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping) + + /*! bins an array of primitives */ + template<typename PrimitiveSplitterFactory> + __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { - for (size_t i=0; i<N; i++) + for (size_t i=begin; i<end; i++) { - const PrimRef prim = prims[i]; + const PrimRef& prim = source[i]; unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); - - if (unlikely(splits == 1)) + + if (unlikely(splits <= 1)) { const vint4 bin = mapping.bin(center(prim.bounds())); for (size_t dim=0; dim<3; dim++) { assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); - numBegin[bin[dim]][dim]++; - numEnd [bin[dim]][dim]++; - bounds [bin[dim]][dim].extend(prim.bounds()); + add(dim,bin[dim],bin[dim],bin[dim],prim.bounds()); } - } + } else { const vint4 bin0 = mapping.bin(prim.bounds().lower); @@ -187,89 +185,44 @@ namespace embree for (size_t dim=0; dim<3; dim++) { + if (unlikely(mapping.invalid(dim))) + continue; + size_t bin; - PrimRef rest = prim; size_t l = bin0[dim]; size_t r = bin1[dim]; - + // same bin optimization if (likely(l == r)) { - numBegin[l][dim]++; - numEnd [l][dim]++; - bounds [l][dim].extend(prim.bounds()); + add(dim,l,l,l,prim.bounds()); continue; } - - for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) + size_t bin_start = bin0[dim]; + size_t bin_end = bin1[dim]; + BBox3fa rest = prim.bounds(); + + /* assure that split position always overlaps the primitive bounds */ + while (bin_start < bin_end && mapping.pos(bin_start+1,dim) <= rest.lower[dim]) bin_start++; + while (bin_start < bin_end && mapping.pos(bin_end ,dim) >= rest.upper[dim]) bin_end--; + + const auto splitter = splitterFactory(prim); + for (bin=bin_start; bin<bin_end; bin++) { const float pos = mapping.pos(bin+1,dim); + BBox3fa left,right; + splitter(rest,dim,pos,left,right); - PrimRef left,right; - splitPrimitive(rest,(int)dim,pos,left,right); - if (unlikely(left.bounds().empty())) l++; - bounds[bin][dim].extend(left.bounds()); + if (unlikely(left.empty())) l++; + extend(dim,bin,left); rest = right; } - if (unlikely(rest.bounds().empty())) r--; - numBegin[l][dim]++; - numEnd [r][dim]++; - bounds [bin][dim].extend(rest.bounds()); + if (unlikely(rest.empty())) r--; + add(dim,l,r,bin,rest); } - } + } } } - - /*! bins a range of primitives inside an array */ - template<typename SplitPrimitive> - void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { - bin(splitPrimitive,prims+begin,end-begin,mapping); - } - - /*! bins an array of primitives */ - template<typename PrimitiveSplitterFactory> - __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) - { - for (size_t i=begin; i<end; i++) - { - const PrimRef &prim = source[i]; - const vint4 bin0 = mapping.bin(prim.bounds().lower); - const vint4 bin1 = mapping.bin(prim.bounds().upper); - - for (size_t dim=0; dim<3; dim++) - { - if (unlikely(mapping.invalid(dim))) - continue; - - size_t bin; - size_t l = bin0[dim]; - size_t r = bin1[dim]; - - // same bin optimization - if (likely(l == r)) - { - add(dim,l,l,l,prim.bounds()); - continue; - } - const size_t bin_start = bin0[dim]; - const size_t bin_end = bin1[dim]; - BBox3fa rest = prim.bounds(); - const auto splitter = splitterFactory(prim); - for (bin=bin_start; bin<bin_end; bin++) - { - const float pos = mapping.pos(bin+1,dim); - BBox3fa left,right; - splitter(rest,dim,pos,left,right); - if (unlikely(left.empty())) l++; - extend(dim,bin,left); - rest = right; - } - if (unlikely(rest.empty())) r--; - add(dim,l,r,bin,rest); - } - } - } - /*! bins an array of primitives */ diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h index 60d235f48d..2584c19bda 100644 --- a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h +++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h @@ -241,7 +241,7 @@ namespace embree SpatialBinner binner(empty); const SpatialBinMapping<SPATIAL_BINS> mapping(set); binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); - /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/ return binner.best(mapping,logBlockSize); //,set.ext_size()); } @@ -256,7 +256,7 @@ namespace embree binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); return binner; }, [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); - /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/ return binner.best(mapping,logBlockSize); //,set.ext_size()); } @@ -286,6 +286,7 @@ namespace embree //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) + if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) { assert(splits > 1); @@ -384,8 +385,8 @@ namespace embree new (&lset) PrimInfoExtRange(begin,center,center,local_left); new (&rset) PrimInfoExtRange(center,end,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); + assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f); + assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f); return std::pair<size_t,size_t>(left_weight,right_weight); } @@ -410,7 +411,7 @@ namespace embree begin,end,local_left,local_right, [&] (const PrimRef& ref) { const Vec3fa c = ref.bounds().center(); - return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }, [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); @@ -419,8 +420,8 @@ namespace embree new (&lset) PrimInfoExtRange(begin,center,center,local_left); new (&rset) PrimInfoExtRange(center,end,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); + assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f); + assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f); return std::pair<size_t,size_t>(left_weight,right_weight); } diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp index d279dc4993..e2d7c27bd8 100644 --- a/thirdparty/embree/kernels/builders/primrefgen.cpp +++ b/thirdparty/embree/kernels/builders/primrefgen.cpp @@ -184,9 +184,7 @@ namespace embree // special variants for grid meshes -// -- GODOT start -- #if defined(EMBREE_GEOMETRY_GRID) -// -- GODOT end -- PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) { PrimInfo pinfo(empty); @@ -296,9 +294,7 @@ namespace embree return pinfo; } -// -- GODOT start -- #endif -// -- GODOT end -- // ==================================================================================================== // ==================================================================================================== diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h index 8cd251ddd2..aa2026a85e 100644 --- a/thirdparty/embree/kernels/builders/primrefgen_presplit.h +++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h @@ -266,7 +266,7 @@ namespace embree /* anything to split ? */ if (center < numPrimitives) { - const size_t numPrimitivesToSplit = numPrimitives - center; + size_t numPrimitivesToSplit = numPrimitives - center; assert(presplitItem[center].priority >= 1.0f); /* sort presplit items in ascending order */ @@ -279,8 +279,8 @@ namespace embree }); ); - unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem; - unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; + unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem; + unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { @@ -317,11 +317,16 @@ namespace embree sum += numSubPrims; } new_center++; + + primOffset0 += new_center - center; + numPrimitivesToSplit -= new_center - center; center = new_center; + assert(numPrimitivesToSplit == (numPrimitives - center)); } /* parallel prefix sum to compute offsets for storing sub-primitives */ const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); + assert(numPrimitives+offset <= alloc_numPrimitives); /* iterate over range, and split primitives into sub primitives and append them to prims array */ parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { @@ -338,7 +343,7 @@ namespace embree unsigned int numSubPrims = 0; splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); const size_t newID = numPrimitives + primOffset1[j-center]; - assert(newID+numSubPrims <= alloc_numPrimitives); + assert(newID+numSubPrims-1 <= alloc_numPrimitives); prims[primrefID] = subPrims[0]; for (size_t i=1;i<numSubPrims;i++) prims[newID+i-1] = subPrims[i]; diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h index f7720bd284..da89d0b178 100644 --- a/thirdparty/embree/kernels/builders/splitter.h +++ b/thirdparty/embree/kernels/builders/splitter.h @@ -128,28 +128,30 @@ namespace embree const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask ); QuadMesh::Quad quad = mesh->quad(prim.primID()); - v[0] = mesh->vertex(quad.v[0]); - v[1] = mesh->vertex(quad.v[1]); - v[2] = mesh->vertex(quad.v[2]); - v[3] = mesh->vertex(quad.v[3]); - v[4] = mesh->vertex(quad.v[0]); - inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); - inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); - inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); - inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); + v[0] = mesh->vertex(quad.v[1]); + v[1] = mesh->vertex(quad.v[2]); + v[2] = mesh->vertex(quad.v[3]); + v[3] = mesh->vertex(quad.v[0]); + v[4] = mesh->vertex(quad.v[1]); + v[5] = mesh->vertex(quad.v[3]); + inv_length[0] = Vec3fa(1.0f) / (v[1] - v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2] - v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[3] - v[2]); + inv_length[3] = Vec3fa(1.0f) / (v[4] - v[3]); + inv_length[4] = Vec3fa(1.0f) / (v[5] - v[4]); } __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { - splitPolygon<4>(prim,dim,pos,v,left_o,right_o); + splitPolygon<5>(prim,dim,pos,v,left_o,right_o); } __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { - splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); + splitPolygon<5>(prim,dim,pos,v,inv_length,left_o,right_o); } private: - Vec3fa v[5]; - Vec3fa inv_length[4]; + Vec3fa v[6]; + Vec3fa inv_length[5]; }; struct QuadSplitterFactory diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp index a84295f0da..f6cf626465 100644 --- a/thirdparty/embree/kernels/bvh/bvh.cpp +++ b/thirdparty/embree/kernels/bvh/bvh.cpp @@ -183,7 +183,7 @@ namespace embree template class BVHN<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) template class BVHN<4>; #endif } diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp index 6e9a5a538e..1d393fd06b 100644 --- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp @@ -230,7 +230,7 @@ namespace embree continue; /* switch to single ray traversal */ -#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__)) #if FORCE_SINGLE_MODE == 0 if (single) #endif @@ -676,7 +676,7 @@ namespace embree continue; /* switch to single ray traversal */ -#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__)) #if FORCE_SINGLE_MODE == 0 if (single) #endif diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h index 717f559677..c7e040fadb 100644 --- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h @@ -170,12 +170,23 @@ namespace embree TravRayKStream<K,robust> &p = packets[rayID / K]; const size_t i = rayID % K; const vint<N> bitmask(shiftTable[rayID]); + +#if defined (__aarch64__) + const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); + const vfloat<N> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat<N> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat<N> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); +#else const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); +#endif + const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h index 57530692bc..3fd9fc7d18 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h @@ -46,6 +46,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i] == NodeRef::emptyNode); + assert(emptyChild == (children[i] == NodeRef::emptyNode)); + } +#endif AABBNode_t* node = ref.getAABBNode(); for (size_t i=0; i<num; i++) node->setRef(i,children[i]); return ref; @@ -60,6 +68,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i] == NodeRef::emptyNode); + assert(emptyChild == (children[i] == NodeRef::emptyNode)); + } +#endif AABBNode_t* node = ref.getAABBNode(); for (size_t i=0; i<num; i++) node->setRef(i,children[i]); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h index c4cea7d8ba..001f526c25 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h @@ -31,6 +31,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i].ref == NodeRef::emptyNode); + assert(emptyChild == (children[i].ref == NodeRef::emptyNode)); + } +#endif AABBNodeMB_t* node = ref.getAABBNodeMB(); LBBox3fa bounds = empty; diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h index 46a81d7581..3b966fd054 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h @@ -41,6 +41,14 @@ namespace embree template<typename BuildRecord> __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i].ref == NodeRef::emptyNode); + assert(emptyChild == (children[i].ref == NodeRef::emptyNode)); + } +#endif if (likely(ref.isAABBNodeMB())) { for (size_t i=0; i<num; i++) ref.getAABBNodeMB()->set(i, children[i]); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h index 2afc8c98e7..99671ddc5a 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h @@ -190,6 +190,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i] == NodeRef::emptyNode); + assert(emptyChild == (children[i] == NodeRef::emptyNode)); + } +#endif QuantizedNode_t* node = ref.quantizedNode(); for (size_t i=0; i<num; i++) node->setRef(i,children[i]); return ref; diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp index d857ff7d95..57f75bfd7e 100644 --- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp @@ -162,7 +162,7 @@ namespace embree template class BVHNStatistics<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) +#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) template class BVHNStatistics<4>; #endif } diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h index 1ec4fc63fc..17641fa888 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector1.h +++ b/thirdparty/embree/kernels/bvh/node_intersector1.h @@ -5,6 +5,15 @@ #include "node_intersector.h" +#if defined(__AVX2__) +#define __FMA_X4__ +#endif + +#if defined(__aarch64__) +#define __FMA_X4__ +#endif + + namespace embree { namespace isa @@ -29,9 +38,15 @@ namespace embree org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) const Vec3fa ray_org_rdir = ray_org*ray_rdir; +#if !defined(__aarch64__) org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#else + //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd + //x86 will use msub + neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z); +#endif #endif nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); @@ -49,8 +64,12 @@ namespace embree org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); -#if defined(__AVX2__) || defined(__ARM_NEON) - org_rdir = org*rdir; +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + org_rdir = org*rdir; +#else + neg_org_rdir = -(org*rdir); +#endif #endif nearX = nearXYZ.x[k]; nearY = nearXYZ.y[k]; @@ -62,8 +81,14 @@ namespace embree Vec3fa org_xyz, dir_xyz; Vec3vf<N> org, dir, rdir; -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if !defined(__aarch64__) Vec3vf<N> org_rdir; +#else + //aarch64 version are keeping negation of the org_rdir and use madd + //x86 uses msub + Vec3vf<N> neg_org_rdir; +#endif #endif size_t nearX, nearY, nearZ; size_t farX, farY, farZ; @@ -404,13 +429,22 @@ namespace embree template<> __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist) { -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; @@ -450,13 +484,23 @@ namespace embree template<> __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist) { -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif + #else const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; @@ -522,13 +566,22 @@ namespace embree const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; @@ -537,7 +590,7 @@ namespace embree const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; #endif -#if defined(__AVX2__) && !defined(__AVX512F__) // HSW +#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool<N> vmask = asInt(tNear) > asInt(tFar); @@ -598,13 +651,22 @@ namespace embree const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); -#if defined (__AVX2__) || defined(__ARM_NEON) +#if defined (__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; @@ -613,7 +675,7 @@ namespace embree const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; #endif -#if defined(__AVX2__) && !defined(__AVX512F__) +#if defined(__FMA_X4__) && !defined(__AVX512F__) const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); #else @@ -687,13 +749,22 @@ namespace embree const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; @@ -703,7 +774,7 @@ namespace embree const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; #endif -#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW +#if defined(__aarch64__) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool4 vmask = asInt(tNear) > asInt(tFar); @@ -775,13 +846,22 @@ namespace embree const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; @@ -857,13 +937,22 @@ namespace embree const vfloat<N> upper_y = node->dequantizeUpperY(time); const vfloat<N> lower_z = node->dequantizeLowerZ(time); const vfloat<N> upper_z = node->dequantizeUpperZ(time); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y; diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h index 1f7215e5df..cad4e6de2d 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h @@ -75,9 +75,13 @@ namespace embree min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); +#if defined (__aarch64__) + neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); + neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); +#else min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); - +#endif min_dist = reduced_min_dist; max_dist = reduced_max_dist; @@ -95,9 +99,13 @@ namespace embree Vec3fa min_rdir; Vec3fa max_rdir; +#if defined (__aarch64__) + Vec3fa neg_min_org_rdir; + Vec3fa neg_max_org_rdir; +#else Vec3fa min_org_rdir; Vec3fa max_org_rdir; - +#endif float min_dist; float max_dist; }; @@ -191,13 +199,21 @@ namespace embree const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); +#if defined (__aarch64__) + const vfloat<N> fminX = madd(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.neg_min_org_rdir.x)); + const vfloat<N> fminY = madd(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.neg_min_org_rdir.y)); + const vfloat<N> fminZ = madd(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.neg_min_org_rdir.z)); + const vfloat<N> fmaxX = madd(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.neg_max_org_rdir.x)); + const vfloat<N> fmaxY = madd(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.neg_max_org_rdir.y)); + const vfloat<N> fmaxZ = madd(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.neg_max_org_rdir.z)); +#else const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x)); const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y)); const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z)); const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x)); const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y)); const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z)); - +#endif const vfloat<N> fmin = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist)); dist = fmin; const vfloat<N> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist)); diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h index d5498fc5db..4deacd620d 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_packet.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h @@ -39,7 +39,9 @@ namespace embree org = ray_org; dir = ray_dir; rdir = rcp_safe(ray_dir); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + neg_org_rdir = -(org * rdir); +#elif defined(__AVX2__) org_rdir = org * rdir; #endif @@ -55,7 +57,9 @@ namespace embree Vec3vf<K> org; Vec3vf<K> dir; Vec3vf<K> rdir; -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + Vec3vf<K> neg_org_rdir; +#elif defined(__AVX2__) Vec3vf<K> org_rdir; #endif Vec3vi<K> nearXYZ; @@ -119,7 +123,14 @@ namespace embree const TravRayKFast<K>& ray, vfloat<K>& dist) { - #if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); @@ -199,7 +210,14 @@ namespace embree const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); @@ -302,7 +320,14 @@ namespace embree const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); @@ -464,7 +489,14 @@ namespace embree const vfloat<N> lower_z = node->dequantizeLowerZ(); const vfloat<N> upper_z = node->dequantizeUpperZ(); - #if defined(__AVX2__) || defined(__ARM_NEON) + #if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); + #elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); @@ -549,7 +581,14 @@ namespace embree const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time); const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h index 55b2c27231..943fd7043f 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h @@ -32,11 +32,19 @@ namespace embree __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) { rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(ray_org * rdir); +#else org_rdir = ray_org * rdir; +#endif } Vec3vf<K> rdir; +#if defined(__aarch64__) + Vec3vf<K> neg_org_rdir; +#else Vec3vf<K> org_rdir; +#endif vfloat<K> tnear; vfloat<K> tfar; }; @@ -87,12 +95,21 @@ namespace embree const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); +#if defined (__aarch64__) + const vfloat<N> rminX = madd(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k])); + const vfloat<N> rminY = madd(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k])); + const vfloat<N> rminZ = madd(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k])); + const vfloat<N> rmaxX = madd(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k])); + const vfloat<N> rmaxY = madd(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k])); + const vfloat<N> rmaxZ = madd(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k])); +#else const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); +#endif const vfloat<N> rmin = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k])); const vfloat<N> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k])); @@ -113,12 +130,21 @@ namespace embree const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); +#if defined (__aarch64__) + const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); +#endif const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear); const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h index cc4ea1805b..d24326ce92 100644 --- a/thirdparty/embree/kernels/common/accel.h +++ b/thirdparty/embree/kernels/common/accel.h @@ -332,7 +332,7 @@ namespace embree intersectorN.intersect(this,rayN,N,context); } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) { const vint<4> mask = valid.mask32(); intersect4(&mask,(RTCRayHit4&)ray,context); @@ -388,7 +388,7 @@ namespace embree intersectorN.occluded(this,rayN,N,context); } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) { const vint<4> mask = valid.mask32(); occluded4(&mask,(RTCRay4&)ray,context); diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp index 32a27c560a..111c62083d 100644 --- a/thirdparty/embree/kernels/common/acceln.cpp +++ b/thirdparty/embree/kernels/common/acceln.cpp @@ -97,7 +97,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded4(valid,ray,context); -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); if (unlikely(none(valid0 & hit0))) break; @@ -111,7 +111,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded8(valid,ray,context); -#if defined(__SSE2__) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); @@ -127,7 +127,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded16(valid,ray,context); -#if defined(__SSE2__) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h index 90b184a07b..1b67120c97 100644 --- a/thirdparty/embree/kernels/common/accelset.h +++ b/thirdparty/embree/kernels/common/accelset.h @@ -14,21 +14,14 @@ namespace embree struct IntersectFunctionNArguments; struct OccludedFunctionNArguments; - typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); - typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); - struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments { - IntersectContext* internal_context; Geometry* geometry; - ReportIntersectionFunc report; }; struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments { - IntersectContext* internal_context; Geometry* geometry; - ReportOcclusionFunc report; }; /*! Base class for set of acceleration structures. */ @@ -145,7 +138,7 @@ namespace embree public: /*! Intersects a single ray with the scene. */ - __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.intersect); @@ -159,15 +152,13 @@ namespace embree args.N = 1; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.intersect(&args); } /*! Tests if single ray is occluded by the scene. */ - __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.occluded); @@ -181,16 +172,14 @@ namespace embree args.N = 1; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.occluded(&args); } /*! Intersects a packet of K rays with the scene. */ template<int K> - __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.intersect); @@ -204,16 +193,14 @@ namespace embree args.N = K; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.intersect(&args); } /*! Tests if a packet of K rays is occluded by the scene. */ template<int K> - __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.occluded); @@ -227,9 +214,7 @@ namespace embree args.N = K; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.occluded(&args); } diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp index 1a0e1aeed3..38a76225f4 100644 --- a/thirdparty/embree/kernels/common/alloc.cpp +++ b/thirdparty/embree/kernels/common/alloc.cpp @@ -3,6 +3,9 @@ #include "alloc.h" #include "../../common/sys/thread.h" +#if defined(APPLE) && defined(__aarch64__) +#include "../../common/sys/barrier.h" +#endif namespace embree { diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h index 4458e35c24..12769df2c8 100644 --- a/thirdparty/embree/kernels/common/alloc.h +++ b/thirdparty/embree/kernels/common/alloc.h @@ -8,6 +8,10 @@ #include "scene.h" #include "primref.h" +#if defined(APPLE) && defined(__aarch64__) +#include <mutex> +#endif + namespace embree { class FastAllocator @@ -26,7 +30,7 @@ namespace embree public: struct ThreadLocal2; - enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE }; + enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; /*! Per thread structure holding the current memory block. */ struct __aligned(64) ThreadLocal @@ -132,7 +136,11 @@ namespace embree { assert(alloc_i); if (alloc.load() == alloc_i) return; +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else Lock<SpinLock> lock(mutex); +#endif //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind if (alloc.load()) { alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); @@ -150,7 +158,11 @@ namespace embree { assert(alloc_i); if (alloc.load() != alloc_i) return; +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else Lock<SpinLock> lock(mutex); +#endif if (alloc.load() != alloc_i) return; // required as a different thread calls unbind alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); @@ -161,7 +173,11 @@ namespace embree } public: +#if defined(APPLE) && defined(__aarch64__) + std::mutex mutex; +#else SpinLock mutex; //!< required as unbind is called from other threads +#endif std::atomic<FastAllocator*> alloc; //!< parent allocator ThreadLocal alloc0; ThreadLocal alloc1; @@ -169,7 +185,7 @@ namespace embree FastAllocator (Device* device, bool osAllocation) : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), - growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), primrefarray(device,0) { for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) @@ -206,7 +222,7 @@ namespace embree void setOSallocation(bool flag) { - atype = flag ? OS_MALLOC : ALIGNED_MALLOC; + atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; } private: @@ -217,7 +233,11 @@ namespace embree ThreadLocal2* alloc = thread_local_allocator2; if (alloc == nullptr) { thread_local_allocator2 = alloc = new ThreadLocal2; +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else Lock<SpinLock> lock(s_thread_local_allocators_lock); +#endif s_thread_local_allocators.push_back(make_unique(alloc)); } return alloc; @@ -227,7 +247,11 @@ namespace embree __forceinline void join(ThreadLocal2* alloc) { +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else Lock<SpinLock> lock(thread_local_allocators_lock); +#endif thread_local_allocators.push_back(alloc); } @@ -492,7 +516,11 @@ namespace embree /* parallel block creation in case of no freeBlocks, avoids single global mutex */ if (likely(freeBlocks.load() == nullptr)) { +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(slotMutex[slot]); +#else Lock<SpinLock> lock(slotMutex[slot]); +#endif if (myUsedBlocks == threadUsedBlocks[slot]) { const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); @@ -505,7 +533,11 @@ namespace embree /* if this fails allocate new block */ { - Lock<SpinLock> lock(mutex); +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else + Lock<SpinLock> lock(mutex); +#endif if (myUsedBlocks == threadUsedBlocks[slot]) { if (freeBlocks.load() != nullptr) { @@ -527,7 +559,11 @@ namespace embree /*! add new block */ void addBlock(void* ptr, ssize_t bytes) { +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else Lock<SpinLock> lock(mutex); +#endif const size_t sizeof_Header = offsetof(Block,data[0]); void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); size_t ofs = (size_t) aptr - (size_t) ptr; @@ -613,8 +649,8 @@ namespace embree bytesWasted(alloc->bytesWasted), stat_all(alloc,ANY_TYPE), stat_malloc(alloc,ALIGNED_MALLOC), - stat_4K(alloc,OS_MALLOC,false), - stat_2M(alloc,OS_MALLOC,true), + stat_4K(alloc,EMBREE_OS_MALLOC,false), + stat_2M(alloc,EMBREE_OS_MALLOC,true), stat_shared(alloc,SHARED) {} AllStatistics (size_t bytesUsed, @@ -707,7 +743,7 @@ namespace embree /* We avoid using os_malloc for small blocks as this could * cause a risk of fragmenting the virtual address space and * reach the limit of vm.max_map_count = 65k under Linux. */ - if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize) + if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) atype = ALIGNED_MALLOC; /* we need to additionally allocate some header */ @@ -716,7 +752,7 @@ namespace embree bytesReserve = sizeof_Header+bytesReserve; /* consume full 4k pages with using os_malloc */ - if (atype == OS_MALLOC) { + if (atype == EMBREE_OS_MALLOC) { bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); } @@ -748,11 +784,11 @@ namespace embree return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); } } - else if (atype == OS_MALLOC) + else if (atype == EMBREE_OS_MALLOC) { if (device) device->memoryMonitor(bytesAllocate,false); bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); - return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); } else assert(false); @@ -796,7 +832,7 @@ namespace embree if (device) device->memoryMonitor(-sizeof_Alloced,true); } - else if (atype == OS_MALLOC) { + else if (atype == EMBREE_OS_MALLOC) { size_t sizeof_This = sizeof_Header+reserveEnd; os_free(this,sizeof_This,huge_pages); if (device) device->memoryMonitor(-sizeof_Alloced,true); @@ -857,7 +893,7 @@ namespace embree bool hasType(AllocationType atype_i, bool huge_pages_i) const { if (atype_i == ANY_TYPE ) return true; - else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; else return atype_i == atype; } @@ -906,7 +942,7 @@ namespace embree void print_block() const { if (atype == ALIGNED_MALLOC) std::cout << "A"; - else if (atype == OS_MALLOC) std::cout << "O"; + else if (atype == EMBREE_OS_MALLOC) std::cout << "O"; else if (atype == SHARED) std::cout << "S"; if (huge_pages) std::cout << "H"; size_t bytesUsed = getBlockUsedBytes(); @@ -936,7 +972,11 @@ namespace embree std::atomic<Block*> freeBlocks; std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; - SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#if defined(APPLE) && defined(__aarch64__) + std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#else + PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#endif bool use_single_mode; size_t defaultBlockSize; @@ -950,7 +990,11 @@ namespace embree static __thread ThreadLocal2* thread_local_allocator2; static SpinLock s_thread_local_allocators_lock; static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; +#if defined(APPLE) && defined(__aarch64__) + std::mutex thread_local_allocators_lock; +#else SpinLock thread_local_allocators_lock; +#endif std::vector<ThreadLocal2*> thread_local_allocators; AllocationType atype; mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp index 068e0c2983..833ec65139 100644 --- a/thirdparty/embree/kernels/common/device.cpp +++ b/thirdparty/embree/kernels/common/device.cpp @@ -66,7 +66,11 @@ namespace embree case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break; case CPU::XEON_PHI_KNIGHTS_MILL : frequency_level = FREQUENCY_SIMD512; break; case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break; +#if defined(__APPLE__) + case CPU::ARM: frequency_level = FREQUENCY_SIMD256; break; // Apple M1 supports high throughput for SIMD4 +#else case CPU::ARM: frequency_level = FREQUENCY_SIMD128; break; +#endif } /* initialize global state */ diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h index 2f9f2e7c94..593990f5b1 100644 --- a/thirdparty/embree/kernels/common/geometry.h +++ b/thirdparty/embree/kernels/common/geometry.h @@ -91,7 +91,7 @@ namespace embree size_t numFilterFunctions; //!< number of geometries with filter functions enabled size_t numTriangles; //!< number of enabled triangles - size_t numMBTriangles; //!< number of enabled motion blured triangles + size_t numMBTriangles; //!< number of enabled motion blurred triangles size_t numQuads; //!< number of enabled quads size_t numMBQuads; //!< number of enabled motion blurred quads size_t numBezierCurves; //!< number of enabled curves @@ -99,7 +99,7 @@ namespace embree size_t numLineSegments; //!< number of enabled line segments size_t numMBLineSegments; //!< number of enabled line motion blurred segments size_t numSubdivPatches; //!< number of enabled subdivision patches - size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches + size_t numMBSubdivPatches; //!< number of enabled motion blurred subdivision patches size_t numUserGeometries; //!< number of enabled user geometries size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries size_t numInstancesCheap; //!< number of enabled cheap instances diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h index ae6556336c..9e1132e1a0 100644 --- a/thirdparty/embree/kernels/common/isa.h +++ b/thirdparty/embree/kernels/common/isa.h @@ -44,7 +44,7 @@ namespace embree #define SELECT_SYMBOL_DEFAULT(features,intersector) \ intersector = isa::intersector; -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) #if !defined(EMBREE_TARGET_SIMD4) #define EMBREE_TARGET_SIMD4 #endif diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h index 7b951cc1e8..3c8ee3989c 100644 --- a/thirdparty/embree/kernels/common/ray.h +++ b/thirdparty/embree/kernels/common/ray.h @@ -6,7 +6,7 @@ #include "default.h" #include "instance_stack.h" -// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted +// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted namespace embree { diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp index 94b3819e42..a6ea55bfc4 100644 --- a/thirdparty/embree/kernels/common/rtcore.cpp +++ b/thirdparty/embree/kernels/common/rtcore.cpp @@ -7,6 +7,7 @@ #include "device.h" #include "scene.h" #include "context.h" +#include "../geometry/filter.h" #include "../../include/embree3/rtcore_ray.h" using namespace embree; @@ -482,7 +483,7 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray4* ray4 = (Ray4*) rayhit; + RayHit4* ray4 = (RayHit4*) rayhit; for (size_t i=0; i<4; i++) { if (!valid[i]) continue; RayHit ray1; ray4->get(i,ray1); @@ -513,7 +514,7 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray8* ray8 = (Ray8*) rayhit; + RayHit8* ray8 = (RayHit8*) rayhit; for (size_t i=0; i<8; i++) { if (!valid[i]) continue; RayHit ray1; ray8->get(i,ray1); @@ -546,7 +547,7 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray16* ray16 = (Ray16*) rayhit; + RayHit16* ray16 = (RayHit16*) rayhit; for (size_t i=0; i<16; i++) { if (!valid[i]) continue; RayHit ray1; ray16->get(i,ray1); @@ -1097,13 +1098,13 @@ RTC_NAMESPACE_BEGIN; RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) { IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i; - args->report(args,filter_args); + isa::reportIntersection1(args, filter_args); } RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) { OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; - args->report(args,filter_args); + isa::reportOcclusion1(args,filter_args); } RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) @@ -1763,4 +1764,19 @@ RTC_NAMESPACE_BEGIN; return nullptr; } + RTC_API RTCGeometry rtcGetGeometryThreadSafe (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryThreadSafe); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); +#endif + Ref<Geometry> geom = scene->get_locked(geomID); + return (RTCGeometry) geom.ptr; + RTC_CATCH_END2(scene); + return nullptr; + } + RTC_NAMESPACE_END diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h index f8aad7c7cb..ac58a84d6f 100644 --- a/thirdparty/embree/kernels/common/rtcore.h +++ b/thirdparty/embree/kernels/common/rtcore.h @@ -26,56 +26,59 @@ namespace embree /*! Macros used in the rtcore API implementation */ // -- GODOT start -- -// #define RTC_CATCH_BEGIN try { #define RTC_CATCH_BEGIN - -// #define RTC_CATCH_END(device) \ -// } catch (std::bad_alloc&) { \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// } catch (rtcore_error& e) { \ -// Device::process_error(device,e.error,e.what()); \ -// } catch (std::exception& e) { \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// } catch (...) { \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// } #define RTC_CATCH_END(device) - -// #define RTC_CATCH_END2(scene) \ -// } catch (std::bad_alloc&) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// } catch (rtcore_error& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,e.error,e.what()); \ -// } catch (std::exception& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// } catch (...) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// } #define RTC_CATCH_END2(scene) - -// #define RTC_CATCH_END2_FALSE(scene) \ -// } catch (std::bad_alloc&) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// return false; \ -// } catch (rtcore_error& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,e.error,e.what()); \ -// return false; \ -// } catch (std::exception& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// return false; \ -// } catch (...) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// return false; \ -// } #define RTC_CATCH_END2_FALSE(scene) return false; + +#if 0 +#define RTC_CATCH_BEGIN try { + +#define RTC_CATCH_END(device) \ + } catch (std::bad_alloc&) { \ + Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ + } catch (rtcore_error& e) { \ + Device::process_error(device,e.error,e.what()); \ + } catch (std::exception& e) { \ + Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ + } catch (...) { \ + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + } + +#define RTC_CATCH_END2(scene) \ + } catch (std::bad_alloc&) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ + } catch (rtcore_error& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,e.error,e.what()); \ + } catch (std::exception& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ + } catch (...) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + } + +#define RTC_CATCH_END2_FALSE(scene) \ + } catch (std::bad_alloc&) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ + return false; \ + } catch (rtcore_error& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,e.error,e.what()); \ + return false; \ + } catch (std::exception& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ + return false; \ + } catch (...) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + return false; \ + } +#endif // -- GODOT end -- #define RTC_VERIFY_HANDLE(handle) \ @@ -103,39 +106,35 @@ namespace embree #define RTC_TRACE(x) #endif -// -- GODOT begin -- -// /*! used to throw embree API errors */ -// struct rtcore_error : public std::exception -// { -// __forceinline rtcore_error(RTCError error, const std::string& str) -// : error(error), str(str) {} -// -// ~rtcore_error() throw() {} -// -// const char* what () const throw () { -// return str.c_str(); -// } -// -// RTCError error; -// std::string str; -// }; -// -- GODOT end -- +// -- GODOT start -- +#if 0 + /*! used to throw embree API errors */ + struct rtcore_error : public std::exception + { + __forceinline rtcore_error(RTCError error, const std::string& str) + : error(error), str(str) {} + + ~rtcore_error() throw() {} + + const char* what () const throw () { + return str.c_str(); + } + + RTCError error; + std::string str; + }; +#endif #if defined(DEBUG) // only report file and line in debug mode - // -- GODOT begin -- - // #define throw_RTCError(error,str) \ - // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #define throw_RTCError(error,str) \ printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort(); - // -- GODOT end -- + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #else - // -- GODOT begin -- - // #define throw_RTCError(error,str) \ - // throw rtcore_error(error,str); #define throw_RTCError(error,str) \ abort(); - // -- GODOT end -- + // throw rtcore_error(error,str); #endif +// -- GODOT end -- #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp index 1f1b6f6ddf..29e3bdca20 100644 --- a/thirdparty/embree/kernels/common/rtcore_builder.cpp +++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp @@ -371,7 +371,7 @@ RTC_NAMESPACE_BEGIN bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); bvh->allocator.reset(); - /* switch between differnet builders based on quality level */ + /* switch between different builders based on quality level */ if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW) return rtcBuildBVHMorton(arguments); else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp index 408d7eae6f..65d31d0f81 100644 --- a/thirdparty/embree/kernels/common/scene.cpp +++ b/thirdparty/embree/kernels/common/scene.cpp @@ -629,9 +629,7 @@ namespace embree if (geometry == null) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); - if (geometry->isEnabled()) { - setModified (); - } + setModified (); accels_deleteGeometry(unsigned(geomID)); id_pool.deallocate((unsigned)geomID); geometries[geomID] = null; diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h index a5a39e42d4..a1ea45d3c7 100644 --- a/thirdparty/embree/kernels/common/scene_curves.h +++ b/thirdparty/embree/kernels/common/scene_curves.h @@ -452,6 +452,10 @@ namespace embree const Vec3fa n1 = normal(index+1,itime); if (!isvalid(n0) || !isvalid(n1)) return false; + + const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds(); + if (!isvalid(b)) + return false; } } @@ -612,6 +616,10 @@ namespace embree const Vec3fa dn1 = dnormal(index+1,itime); if (!isvalid(dn0) || !isvalid(dn1)) return false; + + const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds(); + if (!isvalid(b)) + return false; } } diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp index 01c862da0c..db6b803041 100644 --- a/thirdparty/embree/kernels/common/state.cpp +++ b/thirdparty/embree/kernels/common/state.cpp @@ -144,7 +144,20 @@ namespace embree } bool State::checkISASupport() { +#if defined(__ARM_NEON) + /* + * NEON CPU type is a mixture of NEON and SSE2 + */ + + bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; + + /* this will be true when explicitly initialize Device with `isa=neon` config */ + bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; + + return hasSSE2 || hasNEON; +#else return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; +#endif } void State::verify() @@ -157,8 +170,10 @@ namespace embree * functions */ #if defined(DEBUG) #if defined(EMBREE_TARGET_SSE2) +#if !defined(__ARM_NEON) assert(sse2::getISA() <= SSE2); #endif +#endif #if defined(EMBREE_TARGET_SSE42) assert(sse42::getISA() <= SSE42); #endif diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h index 2bf7e93587..84ac27d103 100644 --- a/thirdparty/embree/kernels/config.h +++ b/thirdparty/embree/kernels/config.h @@ -1,5 +1,4 @@ - -// Copyright 2009-2020 Intel Corporation +// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 /* #undef EMBREE_RAY_MASK */ @@ -20,6 +19,7 @@ /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 +#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE #if defined(EMBREE_GEOMETRY_TRIANGLE) #define IF_ENABLED_TRIS(x) x diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h index 3d8900c2aa..75532f5ae0 100644 --- a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h +++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h @@ -225,7 +225,7 @@ namespace embree /* exit if convergence cannot get proven, but terminate if we are very small */ if (unlikely(!subset(K,x) && !very_small)) return false; - /* solve using newton raphson iteration of convergence is guarenteed */ + /* solve using newton raphson iteration of convergence is guaranteed */ solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); return true; } diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h index 2d4abd73ac..ed827d583f 100644 --- a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h +++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h @@ -60,7 +60,7 @@ namespace embree const Vec3fa dir = ray.dir; const float length_ray_dir = length(dir); - /* error of curve evaluations is propertional to largest coordinate */ + /* error of curve evaluations is proportional to largest coordinate */ const BBox3ff box = curve.bounds(); const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper))); diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h index 816c066899..ec6fa9c4f3 100644 --- a/thirdparty/embree/kernels/geometry/disc_intersector.h +++ b/thirdparty/embree/kernels/geometry/disc_intersector.h @@ -68,15 +68,15 @@ namespace embree const Vec3vf<M> center = v0.xyz(); const vfloat<M> radius = v0.w; + /* compute ray distance projC0 to hit point with ray oriented plane */ const Vec3vf<M> c0 = center - ray_org; const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar)); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections if (unlikely(none(valid))) return false; - + + /* check if hit point lies inside disc */ const Vec3vf<M> perp = c0 - projC0 * ray_dir; const vfloat<M> l2 = dot(perp, perp); const vfloat<M> r2 = radius * radius; @@ -84,6 +84,15 @@ namespace embree if (unlikely(none(valid))) return false; + /* We reject hits where the ray origin lies inside the ray + * oriented disc to avoid self intersections. */ +#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE) + const vfloat<M> m2 = dot(c0, c0); + valid &= (m2 > r2); + if (unlikely(none(valid))) + return false; +#endif + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); return epilog(valid, hit); } @@ -152,15 +161,15 @@ namespace embree const Vec3vf<M> center = v0.xyz(); const vfloat<M> radius = v0.w; + /* compute ray distance projC0 to hit point with ray oriented plane */ const Vec3vf<M> c0 = center - ray_org; const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k])); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections if (unlikely(none(valid))) return false; + /* check if hit point lies inside disc */ const Vec3vf<M> perp = c0 - projC0 * ray_dir; const vfloat<M> l2 = dot(perp, perp); const vfloat<M> r2 = radius * radius; @@ -168,6 +177,15 @@ namespace embree if (unlikely(none(valid))) return false; + /* We reject hits where the ray origin lies inside the ray + * oriented disc to avoid self intersections. */ +#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE) + const vfloat<M> m2 = dot(c0, c0); + valid &= (m2 > r2); + if (unlikely(none(valid))) + return false; +#endif + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); return epilog(valid, hit); } diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h index 3b4d924ea7..d64320bf78 100644 --- a/thirdparty/embree/kernels/geometry/filter.h +++ b/thirdparty/embree/kernels/geometry/filter.h @@ -51,20 +51,11 @@ namespace embree __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) { #if defined(EMBREE_FILTER_FUNCTION) - IntersectContext* MAYBE_UNUSED context = args->internal_context; - const Geometry* const geometry = args->geometry; - if (geometry->intersectionFilterN) { - assert(context->scene->hasGeometryFilterFunction()); - geometry->intersectionFilterN(filter_args); - } + if (args->geometry->intersectionFilterN) + args->geometry->intersectionFilterN(filter_args); - //if (args->valid[0] == 0) - // return; - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(filter_args); - } + if (args->context->filter) + args->context->filter(filter_args); #endif } @@ -105,20 +96,11 @@ namespace embree __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) { #if defined(EMBREE_FILTER_FUNCTION) - IntersectContext* MAYBE_UNUSED context = args->internal_context; - const Geometry* const geometry = args->geometry; - if (geometry->occlusionFilterN) { - assert(context->scene->hasGeometryFilterFunction()); - geometry->occlusionFilterN(filter_args); - } - - //if (args->valid[0] == 0) - // return false; + if (args->geometry->occlusionFilterN) + args->geometry->occlusionFilterN(filter_args); - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(filter_args); - } + if (args->context->filter) + args->context->filter(filter_args); #endif } diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h index 11ceb2f7fe..e4ad01852f 100644 --- a/thirdparty/embree/kernels/geometry/object_intersector.h +++ b/thirdparty/embree/kernels/geometry/object_intersector.h @@ -32,7 +32,7 @@ namespace embree return; #endif - accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); + accel->intersect(ray,prim.geomID(),prim.primID(),context); } static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) @@ -44,7 +44,7 @@ namespace embree return false; #endif - accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + accel->occluded(ray,prim.geomID(),prim.primID(),context); return ray.tfar < 0.0f; } @@ -89,7 +89,7 @@ namespace embree valid &= (ray.mask & accel->mask) != 0; if (none(valid)) return; #endif - accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1); + accel->intersect(valid,ray,prim.geomID(),prim.primID(),context); } static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim) @@ -102,7 +102,7 @@ namespace embree valid &= (ray.mask & accel->mask) != 0; if (none(valid)) return false; #endif - accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + accel->occluded(valid,ray,prim.geomID(),prim.primID(),context); return ray.tfar < 0.0f; } diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h index 2137356ff2..514e519b0c 100644 --- a/thirdparty/embree/kernels/geometry/quadv.h +++ b/thirdparty/embree/kernels/geometry/quadv.h @@ -152,7 +152,7 @@ namespace embree Vec3vf<M> v0; // 1st vertex of the quads Vec3vf<M> v1; // 2nd vertex of the quads Vec3vf<M> v2; // 3rd vertex of the quads - Vec3vf<M> v3; // 4rd vertex of the quads + Vec3vf<M> v3; // 4th vertex of the quads private: vuint<M> geomIDs; // geometry ID vuint<M> primIDs; // primitive ID diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h index 0e9393442b..764ff93fec 100644 --- a/thirdparty/embree/kernels/geometry/roundline_intersector.h +++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h @@ -19,7 +19,7 @@ For multiple connected round linear curve segments this construction yield a proper shape when viewed from the outside. Using the - following CSG we can also handle the interiour in most common cases: + following CSG we can also handle the interior in most common cases: round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) = cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr) @@ -431,7 +431,7 @@ namespace embree Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP Inserting the definition of w0 and dw and refactoring - yield a furhter scaled Ng'': + yield a further scaled Ng'': Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h index ad5fee2e4e..e241073812 100644 --- a/thirdparty/embree/kernels/geometry/subgrid_intersector.h +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h @@ -264,8 +264,8 @@ namespace embree const Vec3vf<K> p2 = vtx[i*4+2]; const Vec3vf<K> p3 = vtx[i*4+3]; STAT3(shadow.trav_prims,1,popcnt(valid0),K); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - break; + pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)); + if (none(valid0)) break; } return !valid0; } @@ -408,10 +408,8 @@ namespace embree const Vec3vf<K> p2 = vtx[i*4+2]; const Vec3vf<K> p3 = vtx[i*4+3]; STAT3(shadow.trav_prims,1,popcnt(valid0),K); - //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - - break; + pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)); + if (none(valid0)) break; } return !valid0; } diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index 470e15f03e..39d50e2354 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -1,5 +1,4 @@ - -// Copyright 2009-2020 Intel Corporation +// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd" +#define RTC_HASH "698442324ccddd11725fb8875275dc1384f7fb40" diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h index 2ff03902a7..0a2aef321f 100644 --- a/thirdparty/embree/kernels/subdiv/bezier_patch.h +++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h @@ -94,7 +94,7 @@ namespace embree matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1); matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); - /* compute buttom edge control points */ + /* compute bottom edge control points */ matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1); matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2); diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h index e5ad5dadfe..eab91d9ee6 100644 --- a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h +++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h @@ -388,7 +388,7 @@ namespace embree return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n); } - /* gets limit tangent in the direction of egde vtx -> ring[0] */ + /* gets limit tangent in the direction of edge vtx -> ring[0] */ __forceinline Vertex getLimitTangent() const { if (unlikely(std::isinf(vertex_crease_weight))) @@ -429,7 +429,7 @@ namespace embree return sigma * (alpha + beta); } - /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */ __forceinline Vertex getSecondLimitTangent() const { if (unlikely(std::isinf(vertex_crease_weight))) @@ -763,7 +763,7 @@ namespace embree } - /* gets limit tangent in the direction of egde vtx -> ring[0] */ + /* gets limit tangent in the direction of edge vtx -> ring[0] */ __forceinline Vertex getLimitTangent() const { CatmullClark1Ring cc_vtx; @@ -779,7 +779,7 @@ namespace embree return 2.0f * cc_vtx.getLimitTangent(); } - /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */ __forceinline Vertex getSecondLimitTangent() const { CatmullClark1Ring cc_vtx; diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h index 74fc4c1230..9532287d98 100644 --- a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h +++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h @@ -8,7 +8,7 @@ /* - Implements Catmul Rom curves with control points p0, p1, p2, p3. At + Implements Catmull-Rom curves with control points p0, p1, p2, p3. At t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1 the curve goes through p2 with tangent (p3-p2)/2. @@ -91,11 +91,11 @@ namespace embree : v0(v0), v1(v1), v2(v2), v3(v3) {} __forceinline Vertex begin() const { - return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + return v1; } __forceinline Vertex end() const { - return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + return v2; } __forceinline Vertex center() const { diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h index f8e8a25f35..dcdb101d7c 100644 --- a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h +++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h @@ -81,29 +81,29 @@ namespace embree { SourceCurve<Vec3ff> vcurve = center; SourceCurve<Vec3fa> ncurve = normal; - + /* here we construct a patch which follows the curve l(t) = * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */ const Vec3ff p0 = vcurve.eval(0.0f); const Vec3ff dp0 = vcurve.eval_du(0.0f); - const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); + //const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); // ddp0 is assumed to be 0 const Vec3fa n0 = ncurve.eval(0.0f); const Vec3fa dn0 = ncurve.eval_du(0.0f); const Vec3ff p1 = vcurve.eval(1.0f); const Vec3ff dp1 = vcurve.eval_du(1.0f); - const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); + //const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); // ddp1 is assumed to be 0 const Vec3fa n1 = ncurve.eval(1.0f); const Vec3fa dn1 = ncurve.eval_du(1.0f); const Vec3fa bt0 = cross(n0,dp0); - const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0); + const Vec3fa dbt0 = cross(dn0,dp0);// + cross(n0,ddp0); const Vec3fa bt1 = cross(n1,dp1); - const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1); + const Vec3fa dbt1 = cross(dn1,dp1);// + cross(n1,ddp1); const Vec3fa k0 = normalize(bt0); const Vec3fa dk0 = dnormalize(bt0,dbt0); |