diff options
Diffstat (limited to 'thirdparty/embree')
320 files changed, 94546 insertions, 0 deletions
diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h new file mode 100644 index 0000000000..a64e4a1889 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_any_of.h @@ -0,0 +1,55 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <functional> +#include "parallel_reduce.h" + +namespace embree +{ + + template<typename Index, class UnaryPredicate> + __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) + { + bool ret = false; + +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) { + if (context.is_group_execution_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + context.cancel_group_execution(); + } + } + }); +#else + tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) { + if (tbb::task::self().is_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + tbb::task::self().cancel_group_execution(); + } + } + }); +#endif +#else + ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool { + bool localret = false; + for (auto i=r.begin(); i<r.end(); ++i) { + localret |= pred(i); + } + return localret; + }, + std::bit_or<bool>() + ); +#endif + + return ret; + } + +} // end namespace diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h new file mode 100644 index 0000000000..090ef164c2 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_filter.h @@ -0,0 +1,93 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename Ty, typename Index, typename Predicate> + inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) + { + Index j = first; + for (Index i=first; i<last; i++) + if (predicate(data[i])) + data[j++] = data[i]; + + return j; + } + + template<typename Ty, typename Index, typename Predicate> + inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) + { + /* sequential fallback */ + if (end-begin <= minStepSize) + return sequential_filter(data,begin,end,predicate); + + /* calculate number of tasks to use */ + enum { MAX_TASKS = 64 }; + const Index numThreads = TaskScheduler::threadCount(); + const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; + const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); + + /* filter blocks */ + Index nused[MAX_TASKS]; + Index nfree[MAX_TASKS]; + parallel_for(taskCount, [&](const Index taskIndex) + { + const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; + const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; + const Index i2 = sequential_filter(data,i0,i1,predicate); + nused[taskIndex] = i2-i0; + nfree[taskIndex] = i1-i2; + }); + + /* calculate offsets */ + Index sused=0; + Index sfree=0; + Index pfree[MAX_TASKS]; + for (Index i=0; i<taskCount; i++) + { + sused+=nused[i]; + Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree; + } + + /* return if we did not filter out any element */ + assert(sfree <= end-begin); + assert(sused <= end-begin); + if (sused == end-begin) + return end; + + /* otherwise we have to copy misplaced elements around */ + parallel_for(taskCount, [&](const Index taskIndex) + { + /* destination to write elements to */ + Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex]; + Index dst_end = min(dst+nfree[taskIndex],begin+sused); + if (dst_end <= dst) return; + + /* range of misplaced elements to copy to destination */ + Index r0 = pfree[taskIndex]; + Index r1 = r0+dst_end-dst; + + /* find range in misplaced elements in back to front order */ + Index k0=0; + for (Index i=taskCount-1; i>0; i--) + { + if (k0 > r1) break; + Index k1 = k0+nused[i]; + Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; + for (Index i=max(r0,k0); i<min(r1,k1); i++) { + Index isrc = src-i+k0-1; + assert(dst >= begin && dst < end); + assert(isrc >= begin && isrc < end); + data[dst++] = data[isrc]; + } + k0 = k1; + } + }); + + return begin+sused; + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h new file mode 100644 index 0000000000..645681ac63 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../tasking/taskscheduler.h" +#include "../sys/array.h" +#include "../math/math.h" +#include "../math/range.h" + +namespace embree +{ + /* parallel_for without range */ + template<typename Index, typename Func> + __forceinline void parallel_for( const Index N, const Func& func) + { +#if defined(TASKING_INTERNAL) + if (N) { + TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) { + assert(r.size() == 1); + func(r.begin()); + }); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + } + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range and granulatity */ + template<typename Index, typename Func> + __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) + { + assert(first <= last); +#if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { + func(range<Index>(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { + func(range<Index>(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { + func(range<Index>(i,i+1)); + }); + +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range */ + template<typename Index, typename Func> + __forceinline void parallel_for( const Index first, const Index last, const Func& func) + { + assert(first <= last); + parallel_for(first,last,(Index)1,func); + } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) + + template<typename Index, typename Func> + __forceinline void parallel_for_static( const Index N, const Func& func) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + + typedef tbb::affinity_partitioner affinity_partitioner; + + template<typename Index, typename Func> + __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + +#else + + template<typename Index, typename Func> + __forceinline void parallel_for_static( const Index N, const Func& func) + { + parallel_for(N,func); + } + + struct affinity_partitioner { + }; + + template<typename Index, typename Func> + __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) + { + parallel_for(N,func); + } + +#endif +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h new file mode 100644 index 0000000000..92c37a4a38 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for.h @@ -0,0 +1,149 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename ArrayArray, typename Func> + __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + size_t k=0; + for (size_t i=0; i!=array2.size(); ++i) { + const size_t N = array2[i]->size(); + if (N) func(array2[i],range<size_t>(0,N),k); + k+=N; + } + } + + class ParallelForForState + { + public: + + enum { MAX_TASKS = 64 }; + + __forceinline ParallelForForState () + : taskCount(0) {} + + template<typename ArrayArray> + __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { + init(array2,minStepSize); + } + + template<typename ArrayArray> + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + /* first calculate total number of elements */ + size_t N = 0; + for (size_t i=0; i<array2.size(); i++) { + N += array2[i] ? array2[i]->size() : 0; + } + this->N = N; + + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (N+minStepSize-1)/minStepSize; + taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); + + /* calculate start (i,j) for each task */ + size_t taskIndex = 0; + i0[taskIndex] = 0; + j0[taskIndex] = 0; + size_t k0 = (++taskIndex)*N/taskCount; + for (size_t i=0, k=0; taskIndex < taskCount; i++) + { + assert(i<array2.size()); + size_t j=0, M = array2[i] ? array2[i]->size() : 0; + while (j<M && k+M-j >= k0 && taskIndex < taskCount) { + assert(taskIndex<taskCount); + i0[taskIndex] = i; + j0[taskIndex] = j += k0-k; + k=k0; + k0 = (++taskIndex)*N/taskCount; + } + k+=M-j; + } + } + + __forceinline size_t size() const { + return N; + } + + public: + size_t i0[MAX_TASKS]; + size_t j0[MAX_TASKS]; + size_t taskCount; + size_t N; + }; + + template<typename ArrayArray, typename Func> + __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + ParallelForForState state(array2,minStepSize); + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; k<k1; i++) { + const size_t N = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); + k+=r1-r0; j0 = 0; + } + }); + } + + template<typename ArrayArray, typename Func> + __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) + { + parallel_for_for(array2,1,func); + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + ParallelForForState state(array2,minStepSize); + Value temp[ParallelForForState::MAX_TASKS]; + + for (size_t i=0; i<state.taskCount; i++) + temp[i] = identity; + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; k<k1; i++) { + const size_t N = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); + k+=r1-r0; j0 = 0; + } + }); + + Value ret = identity; + for (size_t i=0; i<state.taskCount; i++) + ret = reduction(ret,temp[i]); + return ret; + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_reduce(array2,1,identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h new file mode 100644 index 0000000000..b15b44a991 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for_for.h" +#include "parallel_prefix_sum.h" + +namespace embree +{ + template<typename Value> + struct ParallelForForPrefixSumState : public ParallelForForState + { + __forceinline ParallelForForPrefixSumState () {} + + template<typename ArrayArray> + __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) + : ParallelForForState(array2,minStepSize) {} + + ParallelPrefixSumState<Value> prefix_state; + }; + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; k<k1; i++) { + const size_t size = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i)); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.prefix_state.counts[i]; + state.prefix_state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; k<k1; i++) { + const size_t size = array2[i] ? array2[i]->size() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.prefix_state.counts[i]; + state.prefix_state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); + } + + template<typename ArrayArray, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h new file mode 100644 index 0000000000..15c098fe20 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_map.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /*! implementation of a key/value map with parallel construction */ + template<typename Key, typename Val> + class parallel_map + { + /* key/value pair to build the map */ + struct KeyValue + { + __forceinline KeyValue () {} + + __forceinline KeyValue (const Key key, const Val val) + : key(key), val(val) {} + + __forceinline operator Key() const { + return key; + } + + public: + Key key; + Val val; + }; + + public: + + /*! parallel map constructors */ + parallel_map () {} + + /*! construction from pair of vectors */ + template<typename KeyVector, typename ValVector> + parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } + + /*! initialized the parallel map from a vector with keys and values */ + template<typename KeyVector, typename ValVector> + void init(const KeyVector& keys, const ValVector& values) + { + /* reserve sufficient space for all data */ + assert(keys.size() == values.size()); + vec.resize(keys.size()); + + /* generate key/value pairs */ + parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + vec[i] = KeyValue((Key)keys[i],values[i]); + }); + + /* perform parallel radix sort of the key/value pairs */ + std::vector<KeyValue> temp(keys.size()); + radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size()); + } + + /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ + __forceinline const Val* lookup(const Key& key) const + { + typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return nullptr; + if (i->key != key) return nullptr; + return &i->val; + } + + /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ + __forceinline Val lookup(const Key& key, const Val& def) const + { + typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return def; + if (i->key != key) return def; + return i->val; + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector<KeyValue> vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h new file mode 100644 index 0000000000..a1cbdc8e04 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_partition.h @@ -0,0 +1,283 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" +#include "../math/range.h" + +namespace embree +{ + /* serial partitioning */ + template<typename T, typename V, typename IsLeft, typename Reduction_T> + __forceinline size_t serial_partitioning(T* array, + const size_t begin, + const size_t end, + V& leftReduction, + V& rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t) + { + T* l = array + begin; + T* r = array + end - 1; + + while(1) + { + /* *l < pivot */ + while (likely(l <= r && is_left(*l) )) + { + //prefetchw(l+4); // FIXME: enable? + reduction_t(leftReduction,*l); + ++l; + } + /* *r >= pivot) */ + while (likely(l <= r && !is_left(*r))) + { + //prefetchw(r-4); FIXME: enable? + reduction_t(rightReduction,*r); + --r; + } + if (r<l) break; + + reduction_t(leftReduction ,*r); + reduction_t(rightReduction,*l); + xchg(*l,*r); + l++; r--; + } + + return l - array; + } + + template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> + class __aligned(64) parallel_partition_task + { + ALIGNED_CLASS_(64); + private: + + static const size_t MAX_TASKS = 64; + + T* array; + size_t N; + const IsLeft& is_left; + const Reduction_T& reduction_t; + const Reduction_V& reduction_v; + const Vi& identity; + + size_t numTasks; + __aligned(64) size_t counter_start[MAX_TASKS+1]; + __aligned(64) size_t counter_left[MAX_TASKS+1]; + __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS]; + __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; + __aligned(64) V leftReductions[MAX_TASKS]; + __aligned(64) V rightReductions[MAX_TASKS]; + + public: + + __forceinline parallel_partition_task(T* array, + const size_t N, + const Vi& identity, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + const size_t BLOCK_SIZE) + + : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), + numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} + + __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges) + { + size_t i = 0; + while(index >= (size_t)r[i].size()) + { + assert(i < numRanges); + index -= (size_t)r[i].size(); + i++; + } + return &r[i]; + } + + __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, + const size_t numRightMisplacedRanges, + const size_t startID, + const size_t endID) + { + size_t leftLocalIndex = startID; + size_t rightLocalIndex = startID; + const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); + const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); + + size_t l_left = l_range->size() - leftLocalIndex; + size_t r_left = r_range->size() - rightLocalIndex; + T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; + T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; + size_t size = endID - startID; + size_t items = min(size,min(l_left,r_left)); + + while (size) + { + if (unlikely(l_left == 0)) + { + l_range++; + l_left = l_range->size(); + l = &array[l_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + if (unlikely(r_left == 0)) + { + r_range++; + r_left = r_range->size(); + r = &array[r_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + size -= items; + l_left -= items; + r_left -= items; + + while(items) { + items--; + xchg(*l++,*r++); + } + } + } + + __forceinline size_t partition(V& leftReduction, V& rightReduction) + { + /* partition the individual ranges for each task */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*N/numTasks; + const size_t endID = (taskID+1)*N/numTasks; + V local_left(identity); + V local_right(identity); + const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); + counter_start[taskID] = startID; + counter_left [taskID] = mid-startID; + leftReductions[taskID] = local_left; + rightReductions[taskID] = local_right; + }); + counter_start[numTasks] = N; + counter_left[numTasks] = 0; + + /* finalize the reductions */ + for (size_t i=0; i<numTasks; i++) { + reduction_v(leftReduction,leftReductions[i]); + reduction_v(rightReduction,rightReductions[i]); + } + + /* calculate mid point for partitioning */ + size_t mid = counter_left[0]; + for (size_t i=1; i<numTasks; i++) + mid += counter_left[i]; + const range<ssize_t> globalLeft (0,mid); + const range<ssize_t> globalRight(mid,N); + + /* calculate all left and right ranges that are on the wrong global side */ + size_t numMisplacedRangesLeft = 0; + size_t numMisplacedRangesRight = 0; + size_t numMisplacedItemsLeft = 0; + size_t numMisplacedItemsRight = 0; + + for (size_t i=0; i<numTasks; i++) + { + const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]); + const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]); + const range<ssize_t> left_misplaced = globalLeft. intersect(right_range); + const range<ssize_t> right_misplaced = globalRight.intersect(left_range); + + if (!left_misplaced.empty()) + { + numMisplacedItemsLeft += left_misplaced.size(); + leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; + } + + if (!right_misplaced.empty()) + { + numMisplacedItemsRight += right_misplaced.size(); + rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; + } + } + assert( numMisplacedItemsLeft == numMisplacedItemsRight ); + + /* if no items are misplaced we are done */ + if (numMisplacedItemsLeft == 0) + return mid; + + /* otherwise we copy the items to the right place in parallel */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; + const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; + swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); + }); + + return mid; + } + }; + + template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE = 128) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < BLOCK_SIZE)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; + std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE, + size_t PARALLEL_THRESHOLD) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < PARALLEL_THRESHOLD)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; + std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + + template<typename T, typename IsLeft> + inline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const IsLeft& is_left, + size_t BLOCK_SIZE = 128) + { + size_t leftReduction = 0; + size_t rightReduction = 0; + return parallel_partitioning( + array,begin,end,0,leftReduction,rightReduction,is_left, + [] (size_t& t,const T& ref) { }, + [] (size_t& t0,size_t& t1) { }, + BLOCK_SIZE); + } + +} diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h new file mode 100644 index 0000000000..208bb4e480 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename Value> + struct ParallelPrefixSumState + { + enum { MAX_TASKS = 64 }; + Value counts[MAX_TASKS]; + Value sums [MAX_TASKS]; + }; + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; + const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); + + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; + const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; + state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i<taskCount; i++) + { + const Value c = state.counts[i]; + state.sums[i] = sum; + sum=reduction(sum,c); + } + + return sum; + } + + /*! parallel calculation of prefix sums */ + template<typename SrcArray, typename DstArray, typename Value, typename Add> + __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) + { + /* perform single threaded prefix operation for small N */ + if (N < SINGLE_THREAD_THRESHOLD) + { + Value sum=identity; + for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; + return sum; + } + + /* perform parallel prefix operation for large N */ + else + { + ParallelPrefixSumState<Value> state; + + /* initial run just sets up start values for subtasks */ + parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); + return s; + + }, add); + + /* final run calculates prefix sum */ + return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i<r.end(); i++) { + dst[i] = add(sum,s); + s = add(s,src[i]); + } + return s; + + }, add); + } + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h new file mode 100644 index 0000000000..8271372ea4 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h @@ -0,0 +1,150 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range<Index>(first,last)); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range<Index>(first,last)); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + const Index maxTasks = 512; + const Index threadCount = (Index) TaskScheduler::threadCount(); + taskCount = min(taskCount,threadCount,maxTasks); + + /* parallel invokation of all tasks */ + dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack + parallel_for(taskCount, [&](const Index taskIndex) { + const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; + const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; + values[taskIndex] = func(range<Index>(k0,k1)); + }); + + /* perform reduction over all tasks */ + Value v = identity; + for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]); + return v; + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { +#if defined(TASKING_INTERNAL) + + /* fast path for small number of iterations */ + Index taskCount = (last-first+minStepSize-1)/minStepSize; + if (likely(taskCount == 1)) { + return func(range<Index>(first,last)); + } + return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction,context); + // -- GODOT start -- + // if (context.is_group_execution_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction); + // -- GODOT start -- + // if (tbb::task::self().is_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #endif +#else // TASKING_PPL + struct AlignedValue + { + char storage[__alignof(Value)+sizeof(Value)]; + static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; + Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } + const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); } + AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } + AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } + AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; + AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + operator Value() const { return *getValuePtr(); } + }; + + struct Iterator_Index + { + Index v; + typedef std::forward_iterator_tag iterator_category; + typedef AlignedValue value_type; + typedef Index difference_type; + typedef Index distance_type; + typedef AlignedValue* pointer; + typedef AlignedValue& reference; + __forceinline Iterator_Index() {} + __forceinline Iterator_Index(Index v) : v(v) {} + __forceinline bool operator== (Iterator_Index other) { return v == other.v; } + __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } + __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } + __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } + }; + + auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { + assert(begin.v < end.v); + return reduction(start, func(range<Index>(begin.v, end.v))); + }; + const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); + return v; +#endif + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + if (likely(last-first < parallel_threshold)) { + return func(range<Index>(first,last)); + } else { + return parallel_reduce(first,last,minStepSize,identity,func,reduction); + } + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + auto funcr = [&] ( const range<Index> r ) { + Value v = identity; + for (Index i=r.begin(); i<r.end(); i++) + v = reduction(v,func(i)); + return v; + }; + return parallel_reduce(first,last,Index(1),identity,funcr,reduction); + } + + template<typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h new file mode 100644 index 0000000000..7eae577457 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_set.h @@ -0,0 +1,52 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /* implementation of a set of values with parallel construction */ + template<typename T> + class parallel_set + { + public: + + /*! default constructor for the parallel set */ + parallel_set () {} + + /*! construction from vector */ + template<typename Vector> + parallel_set (const Vector& in) { init(in); } + + /*! initialized the parallel set from a vector */ + template<typename Vector> + void init(const Vector& in) + { + /* copy data to internal vector */ + vec.resize(in.size()); + parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + vec[i] = in[i]; + }); + + /* sort the data */ + std::vector<T> temp(in.size()); + radix_sort<T>(vec.data(),temp.data(),vec.size()); + } + + /*! tests if some element is in the set */ + __forceinline bool lookup(const T& elt) const { + return std::binary_search(vec.begin(), vec.end(), elt); + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector<T> vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h new file mode 100644 index 0000000000..30e56c2bfc --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_sort.h @@ -0,0 +1,454 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../simd/simd.h" +#include "parallel_for.h" +#include <algorithm> + +namespace embree +{ + template<class T> + __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i<length;++i) + { + T v = array[i]; + size_t j = i; + while(j > 0 && v < array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template<class T> + __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i<length;++i) + { + T v = array[i]; + size_t j = i; + while(j > 0 && v > array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template<class T> + void quicksort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_ascending(t, begin, pivot); + quicksort_ascending(t, pivot + 1, end); + } + } + + template<class T> + void quicksort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_decending(t, begin, pivot); + quicksort_decending(t, pivot + 1, end); + } + } + + + template<class T, ssize_t THRESHOLD> + void quicksort_insertionsort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_ascending<T>(&t[begin],size); + } + else + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); + quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); + } + } + } + + + template<class T, ssize_t THRESHOLD> + void quicksort_insertionsort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_decending<T>(&t[begin],size); + } + else + { + + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot); + quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end); + } + } + } + + template<typename T> + static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) + { + static const unsigned int BITS = 8; + static const unsigned int BUCKETS = (1 << BITS); + static const unsigned int CMP_SORT_THRESHOLD = 16; + + __aligned(64) unsigned int count[BUCKETS]; + + /* clear buckets */ + for (size_t i=0;i<BUCKETS;i++) count[i] = 0; + + /* count buckets */ +#if defined(__INTEL_COMPILER) +#pragma nounroll +#endif + for (size_t i=0;i<num;i++) + count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++; + + /* prefix sums */ + __aligned(64) unsigned int head[BUCKETS]; + __aligned(64) unsigned int tail[BUCKETS]; + + head[0] = 0; + for (size_t i=1; i<BUCKETS; i++) + head[i] = head[i-1] + count[i-1]; + + for (size_t i=0; i<BUCKETS-1; i++) + tail[i] = head[i+1]; + + tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1]; + + assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]); + assert(tail[BUCKETS-1] == num); + + /* in-place swap */ + for (size_t i=0;i<BUCKETS;i++) + { + /* process bucket */ + while(head[i] < tail[i]) + { + T v = morton[head[i]]; + while(1) + { + const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); + if (b == i) break; + std::swap(v,morton[head[b]++]); + } + assert((unsigned(v) >> shift & (BUCKETS-1)) == i); + morton[head[i]++] = v; + } + } + if (shift == 0) return; + + size_t offset = 0; + for (size_t i=0;i<BUCKETS;i++) + if (count[i]) + { + + for (size_t j=offset;j<offset+count[i]-1;j++) + assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i); + + if (unlikely(count[i] < CMP_SORT_THRESHOLD)) + insertionsort_ascending(morton + offset, count[i]); + else + radixsort32(morton + offset, count[i], shift-BITS); + + for (size_t j=offset;j<offset+count[i]-1;j++) + assert(morton[j] <= morton[j+1]); + + offset += count[i]; + } + } + + template<typename Ty, typename Key> + class ParallelRadixSort + { + static const size_t MAX_TASKS = 64; + static const size_t BITS = 8; + static const size_t BUCKETS = (1 << BITS); + typedef unsigned int TyRadixCount[BUCKETS]; + + template<typename T> + static bool compare(const T& v0, const T& v1) { + return (Key)v0 < (Key)v1; + } + + private: + ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement + ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement + + + public: + ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) + : radixCount(nullptr), src(src), tmp(tmp), N(N) {} + + void sort(const size_t blockSize) + { + assert(blockSize > 0); + + /* perform single threaded sort for small N */ + if (N<=blockSize) // handles also special case of 0! + { + /* do inplace sort inside destination array */ + std::sort(src,src+N,compare<Ty>); + } + + /* perform parallel sort for large N */ + else + { + const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); + tbbRadixSort(numThreads); + } + } + + ~ParallelRadixSort() + { + alignedFree(radixCount); + radixCount = nullptr; + } + + private: + + void tbbRadixIteration0(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* count how many items go into the buckets */ + for (size_t i=0; i<BUCKETS; i++) + radixCount[threadIndex][i] = 0; + + /* iterate over src array and count buckets */ + unsigned int * __restrict const count = radixCount[threadIndex]; +#if defined(__INTEL_COMPILER) +#pragma nounroll +#endif + for (size_t i=startID; i<endID; i++) { +#if defined(__64BIT__) + const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; +#else + const Key index = ((Key)src[i] >> shift) & mask; +#endif + count[index]++; + } + } + + void tbbRadixIteration1(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* calculate total number of items for each bucket */ + __aligned(64) unsigned int total[BUCKETS]; + /* + for (size_t i=0; i<BUCKETS; i++) + total[i] = 0; + */ + for (size_t i=0; i<BUCKETS; i+=VSIZEX) + vintx::store(&total[i], zero); + + for (size_t i=0; i<threadCount; i++) + { + /* + for (size_t j=0; j<BUCKETS; j++) + total[j] += radixCount[i][j]; + */ + for (size_t j=0; j<BUCKETS; j+=VSIZEX) + vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j])); + } + + /* calculate start offset of each bucket */ + __aligned(64) unsigned int offset[BUCKETS]; + offset[0] = 0; + for (size_t i=1; i<BUCKETS; i++) + offset[i] = offset[i-1] + total[i-1]; + + /* calculate start offset of each bucket for this thread */ + for (size_t i=0; i<threadIndex; i++) + { + /* + for (size_t j=0; j<BUCKETS; j++) + offset[j] += radixCount[i][j]; + */ + for (size_t j=0; j<BUCKETS; j+=VSIZEX) + vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j])); + } + + /* copy items into their buckets */ +#if defined(__INTEL_COMPILER) +#pragma nounroll +#endif + for (size_t i=startID; i<endID; i++) { + const Ty elt = src[i]; +#if defined(__64BIT__) + const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; +#else + const size_t index = ((Key)src[i] >> shift) & mask; +#endif + dst[offset[index]++] = elt; + } + } + + void tbbRadixIteration(const Key shift, const bool last, + const Ty* __restrict src, Ty* __restrict dst, + const size_t numTasks) + { + affinity_partitioner ap; + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); + } + + void tbbRadixSort(const size_t numTasks) + { + radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); + + if (sizeof(Key) == sizeof(uint32_t)) { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,1,tmp,src,numTasks); + } + else if (sizeof(Key) == sizeof(uint64_t)) + { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,0,tmp,src,numTasks); + tbbRadixIteration(4*BITS,0,src,tmp,numTasks); + tbbRadixIteration(5*BITS,0,tmp,src,numTasks); + tbbRadixIteration(6*BITS,0,src,tmp,numTasks); + tbbRadixIteration(7*BITS,1,tmp,src,numTasks); + } + } + + private: + TyRadixCount* radixCount; + Ty* const src; + Ty* const tmp; + const size_t N; + }; + + template<typename Ty> + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); + } + + template<typename Ty, typename Key> + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); + } + + template<typename Ty> + void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); + } + + template<typename Ty> + void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); + } +} diff --git a/thirdparty/embree/common/lexers/parsestream.h b/thirdparty/embree/common/lexers/parsestream.h new file mode 100644 index 0000000000..f65a52cb47 --- /dev/null +++ b/thirdparty/embree/common/lexers/parsestream.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stringstream.h" +#include "../sys/filename.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/col3.h" +#include "../math/color.h" + +namespace embree +{ + /*! helper class for simple command line parsing */ + class ParseStream : public Stream<std::string> + { + public: + ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {} + + ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false) + : cin(new StringStream(cin,seps,endl,multiLine)) {} + + public: + ParseLocation location() { return cin->loc(); } + std::string next() { return cin->get(); } + + void force(const std::string& next) { + std::string token = getString(); + if (token != next) + THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); + } + + std::string getString() { + return get(); + } + + FileName getFileName() { + return FileName(get()); + } + + int getInt () { + return atoi(get().c_str()); + } + + Vec2i getVec2i() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + return Vec2i(x,y); + } + + Vec3ia getVec3ia() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + int z = atoi(get().c_str()); + return Vec3ia(x,y,z); + } + + float getFloat() { + return (float)atof(get().c_str()); + } + + Vec2f getVec2f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + return Vec2f(x,y); + } + + Vec3f getVec3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3f(x,y,z); + } + + Vec3fa getVec3fa() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3fa(x,y,z); + } + + Col3f getCol3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Col3f(x,y,z); + } + + Color getColor() { + float r = (float)atof(get().c_str()); + float g = (float)atof(get().c_str()); + float b = (float)atof(get().c_str()); + return Color(r,g,b); + } + + private: + Ref<Stream<std::string> > cin; + }; +} diff --git a/thirdparty/embree/common/lexers/stream.h b/thirdparty/embree/common/lexers/stream.h new file mode 100644 index 0000000000..a40c15f8eb --- /dev/null +++ b/thirdparty/embree/common/lexers/stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/ref.h" +#include "../sys/filename.h" +#include "../sys/string.h" + +#include <vector> +#include <iostream> +#include <cstdio> +#include <string.h> + +namespace embree +{ + /*! stores the location of a stream element in the source */ + class ParseLocation + { + public: + ParseLocation () : lineNumber(-1), colNumber(-1) {} + ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) + : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} + + std::string str() const + { + std::string str = "unknown"; + if (fileName) str = *fileName; + if (lineNumber >= 0) str += " line " + toString(lineNumber); + if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); + return str; + } + + private: + std::shared_ptr<std::string> fileName; /// name of the file (or stream) the token is from + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + }; + + /*! a stream class templated over the stream elements */ + template<typename T> class Stream : public RefCount + { + enum { BUF_SIZE = 1024 }; + + private: + virtual T next() = 0; + virtual ParseLocation location() = 0; + __forceinline std::pair<T,ParseLocation> nextHelper() { + ParseLocation l = location(); + T v = next(); + return std::pair<T,ParseLocation>(v,l); + } + __forceinline void push_back(const std::pair<T,ParseLocation>& v) { + if (past+future == BUF_SIZE) pop_front(); + size_t end = (start+past+future++)%BUF_SIZE; + buffer[end] = v; + } + __forceinline void pop_front() { + if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); + start = (start+1)%BUF_SIZE; past--; + } + public: + Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} + virtual ~Stream() {} + + public: + + const ParseLocation& loc() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].second; + } + T get() { + if (future == 0) push_back(nextHelper()); + T t = buffer[(start+past)%BUF_SIZE].first; + past++; future--; + return t; + } + const T& peek() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].first; + } + const T& unget(size_t n = 1) { + if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); + past -= n; future += n; + return peek(); + } + void drop() { + if (future == 0) push_back(nextHelper()); + past++; future--; + } + private: + size_t start,past,future; + std::vector<std::pair<T,ParseLocation> > buffer; + }; + + /*! warps an iostream stream */ + class StdStream : public Stream<int> + { + public: + StdStream (std::istream& cin, const std::string& name = "std::stream") + : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} + ~StdStream() {} + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + int next() { + int c = cin.get(); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + private: + std::istream& cin; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; + + /*! creates a stream from a file */ + class FileStream : public Stream<int> + { + public: + + FileStream (FILE* file, const std::string& name = "file") + : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {} + + FileStream (const FileName& fileName) + : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str()))) + { + file = fopen(fileName.c_str(),"r"); + if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); + } + ~FileStream() { if (file) fclose(file); } + + public: + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + + int next() { + int c = fgetc(file); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + FILE* file; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; + + /*! creates a stream from a string */ + class StrStream : public Stream<int> + { + public: + + StrStream (const char* str) + : str(str), lineNumber(1), colNumber(0), charNumber(0) {} + + public: + ParseLocation location() { + return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber); + } + + int next() { + int c = str[charNumber]; + if (c == 0) return EOF; + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + const char* str; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + }; + + /*! creates a character stream from a command line */ + class CommandLineStream : public Stream<int> + { + public: + CommandLineStream (int argc, char** argv, const std::string& name = "command line") + : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) + { + if (argc > 0) { + for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; + charNumber++; + } + for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]); + } + ~CommandLineStream() {} + public: + ParseLocation location() { + return ParseLocation(name,0,charNumber,charNumber); + } + int next() { + if (i == args.size()) return EOF; + if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; } + charNumber++; + return args[i][j++]; + } + private: + size_t i,j; + std::vector<std::string> args; + ssize_t charNumber; /// the character in the file + std::shared_ptr<std::string> name; /// name of buffer + }; +} diff --git a/thirdparty/embree/common/lexers/streamfilters.h b/thirdparty/embree/common/lexers/streamfilters.h new file mode 100644 index 0000000000..3592b77b03 --- /dev/null +++ b/thirdparty/embree/common/lexers/streamfilters.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /* removes all line comments from a stream */ + class LineCommentFilter : public Stream<int> + { + public: + LineCommentFilter (const FileName& fileName, const std::string& lineComment) + : cin(new FileStream(fileName)), lineComment(lineComment) {} + LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment) + : cin(cin), lineComment(lineComment) {} + + ParseLocation location() { return cin->loc(); } + + int next() + { + /* look if the line comment starts here */ + for (size_t j=0; j<lineComment.size(); j++) { + if (cin->peek() != lineComment[j]) { cin->unget(j); goto not_found; } + cin->get(); + } + /* eat all characters until the end of the line (or file) */ + while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); + + not_found: + return cin->get(); + } + + private: + Ref<Stream<int> > cin; + std::string lineComment; + }; +} diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp new file mode 100644 index 0000000000..a037869506 --- /dev/null +++ b/thirdparty/embree/common/lexers/stringstream.cpp @@ -0,0 +1,51 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stringstream.h" + +namespace embree +{ + static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; + } + + /* simple tokenizer */ + StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine) + : cin(cin), endl(endl), multiLine(multiLine) + { + createCharMap(isSepMap,seps); + createCharMap(isValidCharMap,stringChars); + } + + std::string StringStream::next() + { + /* skip separators */ + while (cin->peek() != EOF) { + if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } + if (multiLine && cin->peek() == '\\') { + cin->drop(); + if (cin->peek() == '\n') { cin->drop(); continue; } + cin->unget(); + } + if (!isSeparator(cin->peek())) break; + cin->drop(); + } + + /* parse everything until the next separator */ + std::vector<char> str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); + // -- GODOT start -- + // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); + if (!isValidChar(c)) abort(); + // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); + return std::string(str.data()); + } +} diff --git a/thirdparty/embree/common/lexers/stringstream.h b/thirdparty/embree/common/lexers/stringstream.h new file mode 100644 index 0000000000..6d9c27e3cd --- /dev/null +++ b/thirdparty/embree/common/lexers/stringstream.h @@ -0,0 +1,29 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /*! simple tokenizer that produces a string stream */ + class StringStream : public Stream<std::string> + { + public: + StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false); + public: + ParseLocation location() { return cin->loc(); } + std::string next(); + private: + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } + private: + Ref<Stream<int> > cin; /*! source character stream */ + bool isSepMap[256]; /*! map for fast classification of separators */ + bool isValidCharMap[256]; /*! map for valid characters */ + std::string endl; /*! the token of the end of line */ + bool multiLine; /*! whether to parse lines wrapped with \ */ + }; +} diff --git a/thirdparty/embree/common/lexers/tokenstream.cpp b/thirdparty/embree/common/lexers/tokenstream.cpp new file mode 100644 index 0000000000..6ed6f2045a --- /dev/null +++ b/thirdparty/embree/common/lexers/tokenstream.cpp @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "tokenstream.h" +#include "../math/math.h" + +namespace embree +{ + /* shorthands for common sets of characters */ + const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; + const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const std::string TokenStream::numbers = "0123456789"; + const std::string TokenStream::separators = "\n\t\r "; + const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; + } + + /* build full tokenizer that takes list of valid characters and keywords */ + TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector<std::string>& symbols) //< symbols + : cin(cin), symbols(symbols) + { + createCharMap(isAlphaMap,alpha); + createCharMap(isSepMap,seps); + createCharMap(isStringCharMap,stringChars); + } + + bool TokenStream::decDigits(std::string& str_o) + { + bool ok = false; + std::string str; + if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::decDigits1(std::string& str_o) + { + bool ok = false; + std::string str; + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; else cin->unget(str.size()); + return ok; + } + + bool TokenStream::trySymbol(const std::string& symbol) + { + size_t pos = 0; + while (pos < symbol.size()) { + if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } + cin->drop(); pos++; + } + return true; + } + + bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) + { + for (size_t i=0; i<symbols.size(); i++) { + if (!trySymbol(symbols[i])) continue; + token = Token(symbols[i],Token::TY_SYMBOL,loc); + return true; + } + return false; + } + + bool TokenStream::tryFloat(Token& token, const ParseLocation& loc) + { + bool ok = false; + std::string str; + if (trySymbol("nan")) { + token = Token(float(nan)); + return true; + } + if (trySymbol("+inf")) { + token = Token(float(pos_inf)); + return true; + } + if (trySymbol("-inf")) { + token = Token(float(neg_inf)); + return true; + } + + if (decDigits(str)) + { + if (cin->peek() == '.') { + str += (char)cin->get(); + decDigits(str); + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1.[2]E2 + } + else ok = true; // 1.[2] + } + else if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1E2 + } + } + else + { + if (cin->peek() == '.') { + str += (char)cin->get(); + if (decDigits(str)) { + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // .3E2 + } + else ok = true; // .3 + } + } + } + if (ok) { + token = Token((float)atof(str.c_str()),loc); + } + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { + std::string str; + if (decDigits(str)) { + token = Token(atoi(str.c_str()),loc); + return true; + } + return false; + } + + bool TokenStream::tryString(Token& token, const ParseLocation& loc) + { + std::string str; + if (cin->peek() != '\"') return false; + cin->drop(); + while (cin->peek() != '\"') { + const int c = cin->get(); + if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); + str += (char)c; + } + cin->drop(); + token = Token(str,Token::TY_STRING,loc); + return true; + } + + bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) + { + std::string str; + if (!isAlpha(cin->peek())) return false; + str += (char)cin->get(); + while (isAlphaNum(cin->peek())) str += (char)cin->get(); + token = Token(str,Token::TY_IDENTIFIER,loc); + return true; + } + + void TokenStream::skipSeparators() + { + /* skip separators */ + while (cin->peek() != EOF && isSeparator(cin->peek())) + cin->drop(); + } + + Token TokenStream::next() + { + Token token; + skipSeparators(); + ParseLocation loc = cin->loc(); + if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ + if (tryFloat (token,loc)) return token; /**< try to parse float */ + if (tryInt (token,loc)) return token; /**< try to parse integer */ + if (tryString (token,loc)) return token; /**< try to parse string */ + if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ + if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ + return Token((char)cin->get(),loc); /**< return invalid character token */ + } +} diff --git a/thirdparty/embree/common/lexers/tokenstream.h b/thirdparty/embree/common/lexers/tokenstream.h new file mode 100644 index 0000000000..6e49dd0b39 --- /dev/null +++ b/thirdparty/embree/common/lexers/tokenstream.h @@ -0,0 +1,164 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" +#include <string> +#include <vector> + +namespace embree +{ + /*! token class */ + class Token + { + public: + + enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; + + Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} + Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} + Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} + Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} + Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} + + static Token Eof() { return Token(); } + static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } + static Token Str(std::string str) { return Token(str,TY_STRING); } + static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } + + char Char() const { + if (ty == TY_CHAR) return c; + THROW_RUNTIME_ERROR(loc.str()+": character expected"); + } + + int Int() const { + if (ty == TY_INT) return i; + THROW_RUNTIME_ERROR(loc.str()+": integer expected"); + } + + float Float(bool cast = true) const { + if (ty == TY_FLOAT) return f; + if (ty == TY_INT && cast) return (float)i; + THROW_RUNTIME_ERROR(loc.str()+": float expected"); + } + + std::string Identifier() const { + if (ty == TY_IDENTIFIER) return str; + THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); + } + + std::string String() const { + if (ty == TY_STRING) return str; + THROW_RUNTIME_ERROR(loc.str()+": string expected"); + } + + std::string Symbol() const { + if (ty == TY_SYMBOL) return str; + THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); + } + + const ParseLocation& Location() const { return loc; } + + friend bool operator==(const Token& a, const Token& b) + { + if (a.ty != b.ty) return false; + if (a.ty == TY_CHAR) return a.c == b.c; + if (a.ty == TY_INT) return a.i == b.i; + if (a.ty == TY_FLOAT) return a.f == b.f; + if (a.ty == TY_IDENTIFIER) return a.str == b.str; + if (a.ty == TY_STRING) return a.str == b.str; + if (a.ty == TY_SYMBOL) return a.str == b.str; + return true; + } + + friend bool operator!=(const Token& a, const Token& b) { + return !(a == b); + } + + friend bool operator <( const Token& a, const Token& b ) { + if (a.ty != b.ty) return (int)a.ty < (int)b.ty; + if (a.ty == TY_CHAR) return a.c < b.c; + if (a.ty == TY_INT) return a.i < b.i; + if (a.ty == TY_FLOAT) return a.f < b.f; + if (a.ty == TY_IDENTIFIER) return a.str < b.str; + if (a.ty == TY_STRING) return a.str < b.str; + if (a.ty == TY_SYMBOL) return a.str < b.str; + return false; + } + + friend std::ostream& operator<<(std::ostream& cout, const Token& t) + { + if (t.ty == TY_EOF) return cout << "eof"; + if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; + if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; + if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; + if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; + if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; + if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; + return cout << "unknown"; + } + + private: + Type ty; //< the type of the token + union { + char c; //< data for char tokens + int i; //< data for int tokens + float f; //< data for float tokens + }; + std::string str; //< data for string and identifier tokens + ParseLocation loc; //< the location the token is from + }; + + /*! build full tokenizer that takes list of valid characters and keywords */ + class TokenStream : public Stream<Token> + { + public: + + /*! shorthands for common sets of characters */ + static const std::string alpha; + static const std::string ALPHA; + static const std::string numbers; + static const std::string separators; + static const std::string stringChars; + + public: + TokenStream(const Ref<Stream<int> >& cin, + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols + public: + ParseLocation location() { return cin->loc(); } + Token next(); + bool trySymbol(const std::string& symbol); + + private: + void skipSeparators(); + bool decDigits(std::string& str); + bool decDigits1(std::string& str); + bool trySymbols(Token& token, const ParseLocation& loc); + bool tryFloat(Token& token, const ParseLocation& loc); + bool tryInt(Token& token, const ParseLocation& loc); + bool tryString(Token& token, const ParseLocation& loc); + bool tryIdentifier(Token& token, const ParseLocation& loc); + + Ref<Stream<int> > cin; + bool isSepMap[256]; + bool isAlphaMap[256]; + bool isStringCharMap[256]; + std::vector<std::string> symbols; + + /*! checks if a character is a separator */ + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + + /*! checks if a character is a number */ + __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } + + /*! checks if a character is valid inside a string */ + __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } + + /*! checks if a character is legal for an identifier */ + __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } + __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } + }; +} diff --git a/thirdparty/embree/common/math/affinespace.h b/thirdparty/embree/common/math/affinespace.h new file mode 100644 index 0000000000..9d4a0f0846 --- /dev/null +++ b/thirdparty/embree/common/math/affinespace.h @@ -0,0 +1,361 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linearspace2.h" +#include "linearspace3.h" +#include "quaternion.h" +#include "bbox.h" +#include "vec4.h" + +namespace embree +{ + #define VectorT typename L::Vector + #define ScalarT typename L::Vector::Scalar + + //////////////////////////////////////////////////////////////////////////////// + // Affine Space + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> + struct AffineSpaceT + { + L l; /*< linear part of affine space */ + VectorT p; /*< affine part of affine space */ + + //////////////////////////////////////////////////////////////////////////////// + // Constructors, Assignment, Cast, Copy Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT ( ) { } + __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } + __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } + __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } + + __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} + __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} + + template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {} + + //////////////////////////////////////////////////////////////////////////////// + // Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} + __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} + + /*! return matrix for scaling */ + static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } + + /*! return matrix for translation */ + static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } + + /*! return matrix for rotation, only in 2D */ + static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } + + /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ + static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } + + /*! return matrix for rotation around arbitrary axis and point, only in 3D */ + static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } + + /*! return matrix for looking at given point, only in 3D */ + static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { + VectorT Z = normalize(point-eye); + VectorT U = normalize(cross(up,Z)); + VectorT V = normalize(cross(Z,U)); + return AffineSpaceT(L(U,V,Z),eye); + } + + }; + + // template specialization to get correct identity matrix for type AffineSpace3fa + template<> + __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); } + template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); } + template<typename L> __forceinline AffineSpaceT<L> rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); } + + template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); } + template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); } + template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT & b ) { return a * rcp(b); } + + template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; } + template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a * b; } + template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; } + template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT & b ) { return a = a / b; } + + template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } + template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); } + template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); } + + __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) + { + BBox3fa dst = empty; + const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); + const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); + const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); + const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); + const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); + const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); + const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); + const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); + return dst; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; } + template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) { + return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) { + return cout << "{ l = " << m.l << ", p = " << m.p << " }"; + } + + //////////////////////////////////////////////////////////////////////////////// + // Template Instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef AffineSpaceT<LinearSpace2f> AffineSpace2f; + typedef AffineSpaceT<LinearSpace3f> AffineSpace3f; + typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa; + typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx; + typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff; + typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f; + + template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>> AffineSpace3vf4; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>> AffineSpace3vf8; + typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16; + + template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>> AffineSpace3vfa4; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>> AffineSpace3vfa8; + typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template<typename T, typename R> + __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0, + const AffineSpaceT<T>& M1, + const R& t) + { + return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); + } + + // slerp interprets the 16 floats of the matrix M = D * R * S as components of + // three matrizes (D, R, S) that are interpolated individually. + template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>> + slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0, + const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1, + const T& t) + { + QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + QuaternionT<T> q = slerp(q0, q1, t); + + AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t); + AffineSpaceT<LinearSpace3<Vec3<T>>> D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q); + return D * R * S; + } + + // this is a specialized version for Vec3fa because that does + // not play along nicely with the other templated Vec3/Vec4 types + __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, + const AffineSpace3ff& M1, + const float& t) + { + Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + Quaternion3f q = slerp(q0, q1, t); + + AffineSpace3fa S = lerp(M0, M1, t); + AffineSpace3fa D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * S; + } + + __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) + { + // compute affine transform from quaternion decomposition + Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + AffineSpace3fa M = qd; + AffineSpace3fa D(one); + D.p.x = M.l.vx.y; + D.p.y = M.l.vx.z; + D.p.z = M.l.vy.z; + M.l.vx.y = 0; + M.l.vx.z = 0; + M.l.vy.z = 0; + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * M; + } + + __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) + { + q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + S = qd; + T.x = qd.l.vx.y; + T.y = qd.l.vx.z; + T.z = qd.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + } + + __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) + { + AffineSpace3ff M = S; + M.l.vx.w = q.i; + M.l.vy.w = q.j; + M.l.vz.w = q.k; + M.p.w = q.r; + M.l.vx.y = T.x; + M.l.vx.z = T.y; + M.l.vy.z = T.z; + return M; + } + + struct __aligned(16) QuaternionDecomposition + { + float scale_x = 1.f; + float scale_y = 1.f; + float scale_z = 1.f; + float skew_xy = 0.f; + float skew_xz = 0.f; + float skew_yz = 0.f; + float shift_x = 0.f; + float shift_y = 0.f; + float shift_z = 0.f; + float quaternion_r = 1.f; + float quaternion_i = 0.f; + float quaternion_j = 0.f; + float quaternion_k = 0.f; + float translation_x = 0.f; + float translation_y = 0.f; + float translation_z = 0.f; + }; + + __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) + { + QuaternionDecomposition qd; + qd.scale_x = M.l.vx.x; + qd.scale_y = M.l.vy.y; + qd.scale_z = M.l.vz.z; + qd.shift_x = M.p.x; + qd.shift_y = M.p.y; + qd.shift_z = M.p.z; + qd.translation_x = M.l.vx.y; + qd.translation_y = M.l.vx.z; + qd.translation_z = M.l.vy.z; + qd.skew_xy = M.l.vy.x; + qd.skew_xz = M.l.vz.x; + qd.skew_yz = M.l.vz.y; + qd.quaternion_r = M.p.w; + qd.quaternion_i = M.l.vx.w; + qd.quaternion_j = M.l.vy.w; + qd.quaternion_k = M.l.vz.w; + return qd; + } + + //////////////////////////////////////////////////////////////////////////////// + /* + * ! Template Specialization for 2D: return matrix for rotation around point + * (rotation around arbitrarty vector is not meaningful in 2D) + */ + template<> __forceinline + AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { + return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); + } + + //////////////////////////////////////////////////////////////////////////////// + // Similarity Transform + // + // checks, if M is a similarity transformation, i.e if there exists a factor D + // such that for all x,y: distance(Mx, My) = D * distance(x, y) + //////////////////////////////////////////////////////////////////////////////// + __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) + { + if (D) *D = 0.f; + if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; + if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; + if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; + + const float D_x = dot(M.l.vx, M.l.vx); + const float D_y = dot(M.l.vy, M.l.vy); + const float D_z = dot(M.l.vz, M.l.vz); + + if (abs(D_x - D_y) > 1e-5f || + abs(D_x - D_z) > 1e-5f || + abs(D_y - D_z) > 1e-5f) + return false; + + if (D) *D = sqrtf(D_x); + return true; + } + + __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) + { + Vec3fa::storeu(&ptr->l.vx, source.l.vx); + Vec3fa::storeu(&ptr->l.vy, source.l.vy); + Vec3fa::storeu(&ptr->l.vz, source.l.vz); + Vec3fa::storeu(&ptr->p, source.p); + } + + __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) + { + AffineSpace3fa space; + space.l.vx = Vec3fa::loadu(&ptr->l.vx); + space.l.vy = Vec3fa::loadu(&ptr->l.vy); + space.l.vz = Vec3fa::loadu(&ptr->l.vz); + space.p = Vec3fa::loadu(&ptr->p); + return space; + } + + #undef VectorT + #undef ScalarT +} diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h new file mode 100644 index 0000000000..bc43155358 --- /dev/null +++ b/thirdparty/embree/common/math/bbox.h @@ -0,0 +1,331 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" + +namespace embree +{ + namespace internal { + + template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); } + template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; } + template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; } + + } // namespace internal + template<typename T> + struct BBox + { + T lower, upper; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox ( ) { } + template<typename T1> + __forceinline BBox ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {} + __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline BBox ( const T& v ) : lower(v), upper(v) {} + __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Extending Bounds + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + /*! tests if box is empty */ + __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; } + + /*! computes the size of the box */ + __forceinline T size() const { return upper - lower; } + + /*! computes the center of the box */ + __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); } + + /*! computes twice the center of the box */ + __forceinline T center2() const { return lower+upper; } + + /*! merges two boxes */ + __forceinline static const BBox merge (const BBox& a, const BBox& b) { + return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); + } + + /*! enlarge box by some scaling factor */ + __forceinline BBox enlarge_by(const float a) const { + return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} + }; + + template<> __forceinline bool BBox<float>::empty() const { + return lower > upper; + } + +#if defined(__SSE__) + template<> __forceinline bool BBox<Vec3fa>::empty() const { + return !all(le_mask(lower,upper)); + } + template<> __forceinline bool BBox<Vec3fx>::empty() const { + return !all(le_mask(lower,upper)); + } +#endif + + /*! tests if box is finite */ + __forceinline bool isvalid( const BBox<Vec3fa>& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); + } + + /*! tests if box is finite and non-empty*/ + __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); + } + + /*! tests if box has finite entries */ + __forceinline bool is_finite( const BBox<Vec3fa>& b) { + return is_finite(b.lower) && is_finite(b.upper); + } + + /*! test if point contained in box */ + __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } + + /*! computes the center of the box */ + template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; } + template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); } + + /*! computes the volume of a bounding box */ + __forceinline float volume ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); } + __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); } + + /*! computes the volume of a bounding box */ + __forceinline float volume( const BBox<Vec3f>& b ) { return reduce_mul(b.size()); } + + /*! computes the surface area of a bounding box */ + template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; } + + template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); } + template<typename T> __forceinline const T area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); } + + __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); } + + __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); } + + template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); } + + template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) { + return halfArea(box); + } + + /*! merges bounding boxes and points */ + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const T& b ) { return BBox<T>(min(a.lower, b ), max(a.upper, b )); } + template<typename T> __forceinline const BBox<T> merge( const T& a, const BBox<T>& b ) { return BBox<T>(min(a , b.lower), max(a , b.upper)); } + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); } + + /*! Merges three boxes. */ + template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); } + + /*! Merges four boxes. */ + template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! Comparison Operators */ + template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; } + template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; } + + /*! scaling */ + template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } + template<typename T> __forceinline BBox<T> operator *( const T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); } + + /*! translations */ + template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); } + template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); } + template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower+b ,a.upper+b ); } + template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const T & b ) { return BBox<T>(a.lower-b ,a.upper-b ); } + + /*! extension */ + template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); } + + /*! intersect bounding boxes */ + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); } + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); } + template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + /*! subtract bounds from each other */ + template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d) + { + c.lower = a.lower; + c.upper = min(a.upper,b.lower); + d.lower = max(a.lower,b.upper); + d.upper = a.upper; + } + + /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ + template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); } + template<typename T> __inline bool disjoint( const BBox<T>& a, const T& b ) { return disjoint(a,BBox<T>(b)); } + template<typename T> __inline bool disjoint( const T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); } + + /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */ + template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); } + template<typename T> __inline bool conjoint( const BBox<T>& a, const T& b ) { return conjoint(a,BBox<T>(b)); } + template<typename T> __inline bool conjoint( const T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); } + + /*! subset relation */ + template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b ) + { + for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; + for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; + return true; + } + + template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + /*! blending */ + template<typename T> + __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) { + return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); + } + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) { + return cout << "[" << box.lower << "; " << box.upper << "]"; + } + + /*! default template instantiations */ + typedef BBox<float> BBox1f; + typedef BBox<Vec2f> BBox2f; + typedef BBox<Vec2fa> BBox2fa; + typedef BBox<Vec3f> BBox3f; + typedef BBox<Vec3fa> BBox3fa; + typedef BBox<Vec3fx> BBox3fx; + typedef BBox<Vec3ff> BBox3ff; +} + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<int N> + __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds); + + template<> + __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds) + { + BBox<Vec3<vfloat4>> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } + +#if defined(__AVX__) + template<> + __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) + { + BBox<Vec3<vfloat8>> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + (vfloat4&)bounds[4].lower, + (vfloat4&)bounds[5].lower, + (vfloat4&)bounds[6].lower, + (vfloat4&)bounds[7].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + (vfloat4&)bounds[4].upper, + (vfloat4&)bounds[5].upper, + (vfloat4&)bounds[6].upper, + (vfloat4&)bounds[7].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } +#endif + + template<int N> + __forceinline BBox3fa merge(const BBox3fa* bounds); + + template<> + __forceinline BBox3fa merge<4>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), + min(bounds[2].lower,bounds[3].lower)); + const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), + max(bounds[2].upper,bounds[3].upper)); + return BBox3fa(lower,upper); + } + +#if defined(__AVX__) + template<> + __forceinline BBox3fa merge<8>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), + min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); + const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), + max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); + return BBox3fa(lower,upper); + } +#endif +} + diff --git a/thirdparty/embree/common/math/col3.h b/thirdparty/embree/common/math/col3.h new file mode 100644 index 0000000000..3f50c04393 --- /dev/null +++ b/thirdparty/embree/common/math/col3.h @@ -0,0 +1,47 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Col3 + { + T r, g, b; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 ( ) { } + __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } + __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } + + __forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {} + __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} + __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} + __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} + __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} + }; + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } + + /*! default template instantiations */ + typedef Col3<unsigned char> Col3uc; + typedef Col3<float > Col3f; +} diff --git a/thirdparty/embree/common/math/col4.h b/thirdparty/embree/common/math/col4.h new file mode 100644 index 0000000000..788508516b --- /dev/null +++ b/thirdparty/embree/common/math/col4.h @@ -0,0 +1,47 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Col4 + { + T r, g, b, a; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 ( ) { } + __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } + __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } + + __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} + __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} + __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} + __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} + __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} + }; + + /*! output operator */ + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; + } + + /*! default template instantiations */ + typedef Col4<unsigned char> Col4uc; + typedef Col4<float > Col4f; +} diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h new file mode 100644 index 0000000000..529584ea16 --- /dev/null +++ b/thirdparty/embree/common/math/color.h @@ -0,0 +1,241 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "constants.h" +#include "col3.h" +#include "col4.h" + +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color4 + { + union { + __m128 m128; + struct { float r,g,b,a; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4 () {} + __forceinline Color4 ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} + + __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } + __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } + + __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} + __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + d.a = (unsigned char)(s[3]); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color + { + union { + __m128 m128; + struct { float r,g,b; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color () {} + __forceinline Color ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} + + __forceinline Color ( const Color& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } + + __forceinline Color ( const Color4& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (unsigned char)(s[0]); + d.g = (unsigned char)(s[1]); + d.b = (unsigned char)(s[2]); + d.a = 255; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a ) { return a; } + __forceinline const Color operator -( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline const Color abs ( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline const Color rcp ( const Color& a ) + { +#if defined(__AVX512VL__) + const Color r = _mm_rcp14_ps(a.m128); +#else + const Color r = _mm_rcp_ps(a.m128); +#endif + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + } + __forceinline const Color rsqrt( const Color& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } + __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } + __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } + __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } + + __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } + __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } + __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } + __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } + __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } + __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } + __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + __forceinline bool operator < ( const Color& a, const Color& b ) { + if (a.r != b.r) return a.r < b.r; + if (a.g != b.g) return a.g < b.g; + if (a.b != b.b) return a.b < b.b; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color select( bool s, const Color& t, const Color& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Special Operators + //////////////////////////////////////////////////////////////////////////////// + + /*! computes luminance of a color */ + __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } + + /*! output operator */ + __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } +} diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp new file mode 100644 index 0000000000..03919ae20c --- /dev/null +++ b/thirdparty/embree/common/math/constants.cpp @@ -0,0 +1,27 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "constants.h" + +namespace embree +{ + TrueTy True; + FalseTy False; + ZeroTy zero; + OneTy one; + NegInfTy neg_inf; + PosInfTy inf; + PosInfTy pos_inf; + NaNTy nan; + UlpTy ulp; + PiTy pi; + OneOverPiTy one_over_pi; + TwoPiTy two_pi; + OneOverTwoPiTy one_over_two_pi; + FourPiTy four_pi; + OneOverFourPiTy one_over_four_pi; + StepTy step; + ReverseStepTy reverse_step; + EmptyTy empty; + UndefinedTy undefined; +} diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h new file mode 100644 index 0000000000..578473a8ab --- /dev/null +++ b/thirdparty/embree/common/math/constants.h @@ -0,0 +1,197 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +#include <limits> + +#define _USE_MATH_DEFINES +#include <math.h> // using cmath causes issues under Windows +#include <cfloat> +#include <climits> + +namespace embree +{ + static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; + static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail + + /* we consider floating point numbers in that range as valid input numbers */ + static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; + + struct TrueTy { + __forceinline operator bool( ) const { return true; } + }; + + extern MAYBE_UNUSED TrueTy True; + + struct FalseTy { + __forceinline operator bool( ) const { return false; } + }; + + extern MAYBE_UNUSED FalseTy False; + + struct ZeroTy + { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator char ( ) const { return 0; } + __forceinline operator unsigned char ( ) const { return 0; } + }; + + extern MAYBE_UNUSED ZeroTy zero; + + struct OneTy + { + __forceinline operator double ( ) const { return 1; } + __forceinline operator float ( ) const { return 1; } + __forceinline operator long long( ) const { return 1; } + __forceinline operator unsigned long long( ) const { return 1; } + __forceinline operator long ( ) const { return 1; } + __forceinline operator unsigned long ( ) const { return 1; } + __forceinline operator int ( ) const { return 1; } + __forceinline operator unsigned int ( ) const { return 1; } + __forceinline operator short ( ) const { return 1; } + __forceinline operator unsigned short ( ) const { return 1; } + __forceinline operator char ( ) const { return 1; } + __forceinline operator unsigned char ( ) const { return 1; } + }; + + extern MAYBE_UNUSED OneTy one; + + struct NegInfTy + { + __forceinline operator double ( ) const { return -std::numeric_limits<double>::infinity(); } + __forceinline operator float ( ) const { return -std::numeric_limits<float>::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits<long long>::min(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); } + __forceinline operator long ( ) const { return std::numeric_limits<long>::min(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::min(); } + __forceinline operator int ( ) const { return std::numeric_limits<int>::min(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::min(); } + __forceinline operator short ( ) const { return std::numeric_limits<short>::min(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::min(); } + __forceinline operator char ( ) const { return std::numeric_limits<char>::min(); } + __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::min(); } + + }; + + extern MAYBE_UNUSED NegInfTy neg_inf; + + struct PosInfTy + { + __forceinline operator double ( ) const { return std::numeric_limits<double>::infinity(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits<long long>::max(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); } + __forceinline operator long ( ) const { return std::numeric_limits<long>::max(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits<unsigned long>::max(); } + __forceinline operator int ( ) const { return std::numeric_limits<int>::max(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits<unsigned int>::max(); } + __forceinline operator short ( ) const { return std::numeric_limits<short>::max(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits<unsigned short>::max(); } + __forceinline operator char ( ) const { return std::numeric_limits<char>::max(); } + __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); } + }; + + extern MAYBE_UNUSED PosInfTy inf; + extern MAYBE_UNUSED PosInfTy pos_inf; + + struct NaNTy + { + __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); } + }; + + extern MAYBE_UNUSED NaNTy nan; + + struct UlpTy + { + __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); } + __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); } + }; + + extern MAYBE_UNUSED UlpTy ulp; + + struct PiTy + { + __forceinline operator double( ) const { return double(M_PI); } + __forceinline operator float ( ) const { return float(M_PI); } + }; + + extern MAYBE_UNUSED PiTy pi; + + struct OneOverPiTy + { + __forceinline operator double( ) const { return double(M_1_PI); } + __forceinline operator float ( ) const { return float(M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverPiTy one_over_pi; + + struct TwoPiTy + { + __forceinline operator double( ) const { return double(2.0*M_PI); } + __forceinline operator float ( ) const { return float(2.0*M_PI); } + }; + + extern MAYBE_UNUSED TwoPiTy two_pi; + + struct OneOverTwoPiTy + { + __forceinline operator double( ) const { return double(0.5*M_1_PI); } + __forceinline operator float ( ) const { return float(0.5*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + + struct FourPiTy + { + __forceinline operator double( ) const { return double(4.0*M_PI); } + __forceinline operator float ( ) const { return float(4.0*M_PI); } + }; + + extern MAYBE_UNUSED FourPiTy four_pi; + + struct OneOverFourPiTy + { + __forceinline operator double( ) const { return double(0.25*M_1_PI); } + __forceinline operator float ( ) const { return float(0.25*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + + struct StepTy { + }; + + extern MAYBE_UNUSED StepTy step; + + struct ReverseStepTy { + }; + + extern MAYBE_UNUSED ReverseStepTy reverse_step; + + struct EmptyTy { + }; + + extern MAYBE_UNUSED EmptyTy empty; + + struct FullTy { + }; + + extern MAYBE_UNUSED FullTy full; + + struct UndefinedTy { + }; + + extern MAYBE_UNUSED UndefinedTy undefined; +} diff --git a/thirdparty/embree/common/math/interval.h b/thirdparty/embree/common/math/interval.h new file mode 100644 index 0000000000..310add2129 --- /dev/null +++ b/thirdparty/embree/common/math/interval.h @@ -0,0 +1,161 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" +#include "bbox.h" + +namespace embree +{ + template<typename V> + struct Interval + { + V lower, upper; + + __forceinline Interval() {} + __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } + __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline Interval(const V& a) : lower(a), upper(a) {} + __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} + __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {} + + /*! tests if box is empty */ + //__forceinline bool empty() const { return lower > upper; } + + /*! computes the size of the interval */ + __forceinline V size() const { return upper - lower; } + + __forceinline V center() const { return 0.5f*(lower+upper); } + + __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { + return Interval(a.lower+b.lower,a.upper+b.upper); + } + + __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { + return Interval(a.lower-b.upper,a.upper-b.lower); + } + + __forceinline friend Interval operator -( const Interval& a, const V& b ) { + return Interval(a.lower-b,a.upper-b); + } + + __forceinline friend Interval operator *( const Interval& a, const Interval& b ) + { + const V ll = a.lower*b.lower; + const V lu = a.lower*b.upper; + const V ul = a.upper*b.lower; + const V uu = a.upper*b.upper; + return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b) { + return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) { + return merge(merge(a,b),c); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! intersect bounding boxes */ + __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { + return cout << "[" << a.lower << ", " << a.upper << "]"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} + }; + + __forceinline bool isEmpty(const Interval<float>& v) { + return v.lower > v.upper; + } + + __forceinline vboolx isEmpty(const Interval<vfloatx>& v) { + return v.lower > v.upper; + } + + /*! subset relation */ + template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { + return (a.lower > b.lower) && (a.upper < b.upper); + } + + template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { + return subset(a.x,b.x) && subset(a.y,b.y); + } + + template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { + return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) { + return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) { + return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1) + { + float eps = 1E-4f; + bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; + bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; + return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); + } + + typedef Interval<float> Interval1f; + typedef Vec2<Interval<float>> Interval2f; + typedef Vec3<Interval<float>> Interval3f; + +inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } + +inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } + +#define TWO_PI (2.0*M_PI) +inline Interval1f sin(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float sinLower = sin(interval.lower); + float sinUpper = sin(interval.upper); + if (sinLower > sinUpper) swap(sinLower, sinUpper); + if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; + if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; + return Interval1f(sinLower, sinUpper); +} + +inline Interval1f cos(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float cosLower = cos(interval.lower); + float cosUpper = cos(interval.upper); + if (cosLower > cosUpper) swap(cosLower, cosUpper); + if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; + return Interval1f(cosLower, cosUpper); +} +#undef TWO_PI +} diff --git a/thirdparty/embree/common/math/lbbox.h b/thirdparty/embree/common/math/lbbox.h new file mode 100644 index 0000000000..2b397a05c8 --- /dev/null +++ b/thirdparty/embree/common/math/lbbox.h @@ -0,0 +1,289 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "range.h" + +namespace embree +{ + template<typename T> + __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt) + { + const float rcp_dt_size = float(1.0f)/dt.size(); + const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); + const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); + return std::make_pair(g0,g1); + } + + template<typename T> + struct LBBox + { + public: + __forceinline LBBox () {} + + template<typename T1> + __forceinline LBBox ( const LBBox<T1>& other ) + : bounds0(other.bounds0), bounds1(other.bounds1) {} + + __forceinline LBBox& operator= ( const LBBox& other ) { + bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; + } + + __forceinline LBBox (EmptyTy) + : bounds0(EmptyTy()), bounds1(EmptyTy()) {} + + __forceinline explicit LBBox ( const BBox<T>& bounds) + : bounds0(bounds), bounds1(bounds) { } + + __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) + : bounds0(bounds0), bounds1(bounds1) { } + + LBBox ( const avector<BBox<T>>& bounds ) + { + assert(bounds.size()); + BBox<T> b0 = bounds.front(); + BBox<T> b1 = bounds.back(); + for (size_t i=1; i<bounds.size()-1; i++) { + const float f = float(i)/float(bounds.size()-1); + const BBox<T> bt = lerp(b0,b1,f); + const T dlower = min(bounds[i].lower-bt.lower,T(zero)); + const T dupper = max(bounds[i].upper-bt.upper,T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) + { + const float lower = time_range.lower*numTimeSegments; + const float upper = time_range.upper*numTimeSegments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const int ilower = (int)ilowerf; + const int iupper = (int)iupperf; + + const BBox<T> blower0 = bounds(ilower); + const BBox<T> bupper1 = bounds(iupper); + + if (iupper-ilower == 1) { + bounds0 = lerp(blower0, bupper1, lower-ilowerf); + bounds1 = lerp(bupper1, blower0, iupperf-upper); + return; + } + + const BBox<T> blower1 = bounds(ilower+1); + const BBox<T> bupper0 = bounds(iupper-1); + BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf); + BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper); + + for (int i = ilower+1; i < iupper; i++) + { + const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) + { + /* normalize global time_range_in to local geom_time_range */ + const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), + (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); + + const float lower = time_range.lower*geom_time_segments; + const float upper = time_range.upper*geom_time_segments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const float ilowerfc = max(0.0f,ilowerf); + const float iupperfc = min(iupperf,geom_time_segments); + const int ilowerc = (int)ilowerfc; + const int iupperc = (int)iupperfc; + assert(iupperc-ilowerc > 0); + + /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ + const int ilower_iter = max(-1,(int)ilowerf); + const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); + + const BBox<T> blower0 = bounds(ilowerc); + const BBox<T> bupper1 = bounds(iupperc); + if (iupper_iter-ilower_iter == 1) { + bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); + bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); + return; + } + + const BBox<T> blower1 = bounds(ilowerc+1); + const BBox<T> bupper0 = bounds(iupperc-1); + BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); + BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); + + for (int i = ilower_iter+1; i < iupper_iter; i++) + { + const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template<typename BoundsFunc> + __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments) + { + const int ilower = time_range.begin(); + const int iupper = time_range.end(); + + BBox<T> b0 = bounds(ilower); + BBox<T> b1 = bounds(iupper); + + if (iupper-ilower == 1) + { + bounds0 = b0; + bounds1 = b1; + return; + } + + for (int i = ilower+1; i<iupper; i++) + { + const float f = float(i - time_range.begin()) / float(time_range.size()); + const BBox<T> bt = lerp(b0, b1, f); + const BBox<T> bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + public: + + __forceinline bool empty() const { + return bounds().empty(); + } + + __forceinline BBox<T> bounds () const { + return merge(bounds0,bounds1); + } + + __forceinline BBox<T> interpolate( const float t ) const { + return lerp(bounds0,bounds1,t); + } + + __forceinline LBBox<T> interpolate( const BBox1f& dt ) const { + return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper)); + } + + __forceinline void extend( const LBBox& other ) { + bounds0.extend(other.bounds0); + bounds1.extend(other.bounds1); + } + + __forceinline float expectedHalfArea() const; + + __forceinline float expectedHalfArea(const BBox1f& dt) const { + return interpolate(dt).expectedHalfArea(); + } + + __forceinline float expectedApproxHalfArea() const { + return 0.5f*(halfArea(bounds0) + halfArea(bounds1)); + } + + /* calculates bounds for [0,1] time range from bounds in dt time range */ + __forceinline LBBox global(const BBox1f& dt) const + { + const float rcp_dt_size = 1.0f/dt.size(); + const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size); + const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); + return LBBox(b0,b1); + } + + /*! Comparison Operators */ + //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { + return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; + } + + public: + BBox<T> bounds0, bounds1; + }; + + /*! tests if box is finite */ + template<typename T> + __forceinline bool isvalid( const LBBox<T>& v ) { + return isvalid(v.bounds0) && isvalid(v.bounds1); + } + + template<typename T> + __forceinline bool isvalid_non_empty( const LBBox<T>& v ) { + return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); + } + + template<typename T> + __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) + { + const T da = a1-a0; + const T db = b1-b0; + return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); + } + + template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const + { + const Vec3fa d0 = bounds0.size(); + const Vec3fa d1 = bounds1.size(); + return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), + Vec3fa(d1.x,d1.y,d1.z), + Vec3fa(d0.y,d0.z,d0.x), + Vec3fa(d1.y,d1.z,d1.x))); + } + + template<typename T> + __forceinline float expectedApproxHalfArea(const LBBox<T>& box) { + return box.expectedApproxHalfArea(); + } + + template<typename T> + __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) { + return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); + } + + /*! subset relation */ + template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) { + return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); + } + + /*! default template instantiations */ + typedef LBBox<float> LBBox1f; + typedef LBBox<Vec2f> LBBox2f; + typedef LBBox<Vec3f> LBBox3f; + typedef LBBox<Vec3fa> LBBox3fa; + typedef LBBox<Vec3fx> LBBox3fx; +} diff --git a/thirdparty/embree/common/math/linearspace2.h b/thirdparty/embree/common/math/linearspace2.h new file mode 100644 index 0000000000..184ee695fb --- /dev/null +++ b/thirdparty/embree/common/math/linearspace2.h @@ -0,0 +1,148 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 2D Linear Transform (2x2 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct LinearSpace2 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace2 ( ) {} + __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } + __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } + + template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace2(const Vector& vx, const Vector& vy) + : vx(vx), vy(vy) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, + const Scalar& m10, const Scalar& m11) + : vx(m00,m10), vy(m01,m11) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} + __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace2 scale(const Vector& s) { + return LinearSpace2(s.x, 0, + 0 , s.y); + } + + /*! return matrix for rotation */ + static __forceinline LinearSpace2 rotate(const Scalar& r) { + Scalar s = sin(r), c = cos(r); + return LinearSpace2(c, -s, + s, c); + } + + /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ + LinearSpace2 orthogonal() const + { + LinearSpace2 m = *this; + + // mirrored? + Scalar mirror(one); + if (m.det() < Scalar(zero)) { + m.vx = -m.vx; + mirror = -mirror; + } + + // rotation + for (int i = 0; i < 99; i++) { + const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); + const LinearSpace2 d = m_next - m; + m = m_next; + // norm^2 of difference small enough? + if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) + break; + } + + // rotation * mirror_x + return LinearSpace2(mirror*m.vx, m.vy); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); } + template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); } + template<typename T> __forceinline LinearSpace2<T> rcp ( const LinearSpace2<T>& a ) { return a.inverse(); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); } + template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); } + + template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } + template<typename T> __forceinline T operator*(const LinearSpace2<T>& a, const T & b) { return b.x*a.vx + b.y*a.vy; } + template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); } + + template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); } + template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); } + + template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; } + template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; } + template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace2<Vec2f> LinearSpace2f; + typedef LinearSpace2<Vec2fa> LinearSpace2fa; +} diff --git a/thirdparty/embree/common/math/linearspace3.h b/thirdparty/embree/common/math/linearspace3.h new file mode 100644 index 0000000000..9eaa2cc2bb --- /dev/null +++ b/thirdparty/embree/common/math/linearspace3.h @@ -0,0 +1,213 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "quaternion.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 3D Linear Transform (3x3 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct LinearSpace3 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace3 ( ) {} + __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } + __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } + + template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) + : vx(vx), vy(vy), vz(vz) {} + + /*! construction from quaternion */ + __forceinline LinearSpace3( const QuaternionT<Scalar>& q ) + : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) + , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) + , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, + const Scalar& m10, const Scalar& m11, const Scalar& m12, + const Scalar& m20, const Scalar& m21, const Scalar& m22) + : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } + + /*! returns third row of matrix */ + __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} + __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace3 scale(const Vector& s) { + return LinearSpace3(s.x, 0, 0, + 0 , s.y, 0, + 0 , 0, s.z); + } + + /*! return matrix for rotation around arbitrary axis */ + static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { + Vector u = normalize(_u); + Scalar s = sin(r), c = cos(r); + return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, + u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, + u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy,vz; + }; + + /*! compute transposed matrix */ + template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { + vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); + return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); + } + + template<typename T> + __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { + return xfm.transposed(); + } + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } + template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } + template<typename T> __forceinline LinearSpace3<T> rcp ( const LinearSpace3<T>& a ) { return a.inverse(); } + + /* constructs a coordinate frame form a normalized normal */ + template<typename T> __forceinline LinearSpace3<T> frame(const T& N) + { + const T dx0(0,N.z,-N.y); + const T dx1(-N.z,0,N.x); + const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3<T>(dx,dy,N); + } + + /* constructs a coordinate frame from a normal and approximate x-direction */ + template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi) + { + if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel + const T dx = normalize(cross(dxi,N)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3<T>(dx,dy,N); + } + + /* clamps linear space to range -1 to +1 */ + template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) { + return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)), + clamp(space.vy,T(-1.0f),T(1.0f)), + clamp(space.vz,T(-1.0f),T(1.0f))); + } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } + template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } + + template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } + template<typename T> __forceinline T operator*(const LinearSpace3<T>& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } + template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } + + template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } + template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } + + template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; } + template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; } + + template<typename T> __forceinline T xfmPoint (const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template<typename T> __forceinline T xfmVector(const LinearSpace3<T>& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template<typename T> __forceinline T xfmNormal(const LinearSpace3<T>& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { + return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); + } + + /*! blending */ + template<typename T> + __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) + { + return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace3<Vec3f> LinearSpace3f; + typedef LinearSpace3<Vec3fa> LinearSpace3fa; + typedef LinearSpace3<Vec3fx> LinearSpace3fx; + typedef LinearSpace3<Vec3ff> LinearSpace3ff; + + template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>; + typedef LinearSpace3<Vec3<vfloat<4>>> LinearSpace3vf4; + typedef LinearSpace3<Vec3<vfloat<8>>> LinearSpace3vf8; + typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16; + + /*! blending */ + template<typename T, typename S> + __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, + const LinearSpace3<T>& l1, + const S& t) + { + return LinearSpace3<T>(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + +} diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h new file mode 100644 index 0000000000..4bc54c1a6a --- /dev/null +++ b/thirdparty/embree/common/math/math.h @@ -0,0 +1,369 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "constants.h" +#include <cmath> + +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <emmintrin.h> +#include <xmmintrin.h> +#include <immintrin.h> +#endif + +#if defined(__WIN32__) +#if defined(_MSC_VER) && (_MSC_VER <= 1700) +namespace std +{ + __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } + __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } + __forceinline bool isfinite (const float x) { return _finite(x) != 0; } +} +#endif +#endif + +namespace embree +{ + __forceinline bool isvalid ( const float& v ) { + return (v > -FLT_LARGE) & (v < +FLT_LARGE); + } + + __forceinline int cast_f2i(float f) { + union { float f; int i; } v; v.f = f; return v.i; + } + + __forceinline float cast_i2f(int i) { + union { float f; int i; } v; v.i = i; return v.f; + } + + __forceinline int toInt (const float& a) { return int(a); } + __forceinline float toFloat(const int& a) { return float(a); } + +#if defined(__WIN32__) + __forceinline bool finite ( const float x ) { return _finite(x) != 0; } +#endif + + __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } + __forceinline float sqr ( const float x ) { return x*x; } + + __forceinline float rcp ( const float x ) + { + const __m128 a = _mm_set_ss(x); + +#if defined(__AVX512VL__) + const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rcp_ss(a); +#endif + +#if defined(__AVX2__) + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +#else + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +#endif + } + + __forceinline float signmsk ( const float x ) { + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); + } + __forceinline float xorf( const float x, const float y ) { + return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); + } + __forceinline float andf( const float x, const unsigned y ) { + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); + } + __forceinline float rsqrt( const float x ) + { + const __m128 a = _mm_set_ss(x); +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); +#else + __m128 r = _mm_rsqrt_ss(a); +#endif + r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); +#if defined(__ARM_NEON) + r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); +#endif + return _mm_cvtss_f32(r); + } + +#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) + __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } + __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } + __forceinline int roundf(float f) { return (int)(f + 0.5f); } +#else + __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } + __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } +#endif + + __forceinline float abs ( const float x ) { return ::fabsf(x); } + __forceinline float acos ( const float x ) { return ::acosf (x); } + __forceinline float asin ( const float x ) { return ::asinf (x); } + __forceinline float atan ( const float x ) { return ::atanf (x); } + __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } + __forceinline float cos ( const float x ) { return ::cosf (x); } + __forceinline float cosh ( const float x ) { return ::coshf (x); } + __forceinline float exp ( const float x ) { return ::expf (x); } + __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } + __forceinline float log ( const float x ) { return ::logf (x); } + __forceinline float log10( const float x ) { return ::log10f(x); } + __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } + __forceinline float sin ( const float x ) { return ::sinf (x); } + __forceinline float sinh ( const float x ) { return ::sinhf (x); } + __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } + __forceinline float tan ( const float x ) { return ::tanf (x); } + __forceinline float tanh ( const float x ) { return ::tanhf (x); } + __forceinline float floor( const float x ) { return ::floorf (x); } + __forceinline float ceil ( const float x ) { return ::ceilf (x); } + __forceinline float frac ( const float x ) { return x-floor(x); } + + __forceinline double abs ( const double x ) { return ::fabs(x); } + __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } + __forceinline double acos ( const double x ) { return ::acos (x); } + __forceinline double asin ( const double x ) { return ::asin (x); } + __forceinline double atan ( const double x ) { return ::atan (x); } + __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } + __forceinline double cos ( const double x ) { return ::cos (x); } + __forceinline double cosh ( const double x ) { return ::cosh (x); } + __forceinline double exp ( const double x ) { return ::exp (x); } + __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } + __forceinline double log ( const double x ) { return ::log (x); } + __forceinline double log10( const double x ) { return ::log10(x); } + __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } + __forceinline double rcp ( const double x ) { return 1.0/x; } + __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } + __forceinline double sin ( const double x ) { return ::sin (x); } + __forceinline double sinh ( const double x ) { return ::sinh (x); } + __forceinline double sqr ( const double x ) { return x*x; } + __forceinline double sqrt ( const double x ) { return ::sqrt (x); } + __forceinline double tan ( const double x ) { return ::tan (x); } + __forceinline double tanh ( const double x ) { return ::tanh (x); } + __forceinline double floor( const double x ) { return ::floor (x); } + __forceinline double ceil ( const double x ) { return ::ceil (x); } + +#if defined(__SSE4_1__) + __forceinline float mini(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_min_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + +#if defined(__SSE4_1__) + __forceinline float maxi(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_max_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + + template<typename T> + __forceinline T twice(const T& a) { return a+a; } + + __forceinline int min(int a, int b) { return a<b ? a:b; } + __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; } + __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } + __forceinline float min(float a, float b) { return a<b ? a:b; } + __forceinline double min(double a, double b) { return a<b ? a:b; } +#if defined(__64BIT__) + __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } +#endif + + template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } + template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } + template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } + + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } + template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } + + __forceinline int max(int a, int b) { return a<b ? b:a; } + __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; } + __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } + __forceinline float max(float a, float b) { return a<b ? b:a; } + __forceinline double max(double a, double b) { return a<b ? b:a; } +#if defined(__64BIT__) + __forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; } +#endif + + template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } + template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } + template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } + + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } + template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } + +#if defined(__MACOSX__) + __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; } + __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; } +#endif + +#if defined(__MACOSX__) && !defined(__INTEL_COMPILER) + __forceinline void sincosf(float x, float *sin, float *cos) { + __sincosf(x,sin,cos); + } +#endif + +#if defined(__WIN32__) || defined(__FreeBSD__) + __forceinline void sincosf(float x, float *s, float *c) { + *s = sinf(x); *c = cosf(x); + } +#endif + + template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } + template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } + + template<typename T> __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } + template<typename T> __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } + template<typename T> __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } + template<typename T> __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } + +#if defined(__AVX2__) + __forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } +#else + __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } + __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } + __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} + __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } +#endif + + /*! random functions */ + template<typename T> T random() { return T(0); } +#if defined(_WIN32) + template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } +#else + template<> __forceinline int random() { return int(rand()); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } +#endif + template<> __forceinline float random() { return rand()/float(RAND_MAX); } + template<> __forceinline double random() { return rand()/double(RAND_MAX); } + +#if _WIN32 + __forceinline double drand48() { + return double(rand())/double(RAND_MAX); + } + + __forceinline void srand48(long seed) { + return srand(seed); + } +#endif + + /*! selects */ + __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } + __forceinline int select(bool s, int t, int f) { return s ? t : f; } + __forceinline float select(bool s, float t, float f) { return s ? t : f; } + + __forceinline bool all(bool s) { return s; } + + __forceinline float lerp(const float v0, const float v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + template<typename T> + __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { + return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); + } + + /*! exchange */ + template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } + + /* load/store */ + template<typename Ty> struct mem; + + template<> struct mem<float> { + static __forceinline float load (bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } + static __forceinline float loadu(bool mask, const void* ptr) { return mask ? *(float*)ptr : 0.0f; } + + static __forceinline void store (bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } + static __forceinline void storeu(bool mask, void* ptr, const float v) { if (mask) *(float*)ptr = v; } + }; + + /*! bit reverse operation */ + template<class T> + __forceinline T bitReverse(const T& vin) + { + T v = vin; + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + v = ( v >> 16 ) | ( v << 16); + return v; + } + + /*! bit interleave operation */ + template<class T> + __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) + { + T x = xin, y = yin, z = zin; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); + } + +#if defined(__AVX2__) + + template<> + __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) + { + const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); + const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); + const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); + return xx | yy | zz; + } + +#endif + + /*! bit interleave operation for 64bit data types*/ + template<class T> + __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ + T x = xin & 0x1fffff; + T y = yin & 0x1fffff; + T z = zin & 0x1fffff; + + x = (x | x << 32) & 0x1f00000000ffff; + x = (x | x << 16) & 0x1f0000ff0000ff; + x = (x | x << 8) & 0x100f00f00f00f00f; + x = (x | x << 4) & 0x10c30c30c30c30c3; + x = (x | x << 2) & 0x1249249249249249; + + y = (y | y << 32) & 0x1f00000000ffff; + y = (y | y << 16) & 0x1f0000ff0000ff; + y = (y | y << 8) & 0x100f00f00f00f00f; + y = (y | y << 4) & 0x10c30c30c30c30c3; + y = (y | y << 2) & 0x1249249249249249; + + z = (z | z << 32) & 0x1f00000000ffff; + z = (z | z << 16) & 0x1f0000ff0000ff; + z = (z | z << 8) & 0x100f00f00f00f00f; + z = (z | z << 4) & 0x10c30c30c30c30c3; + z = (z | z << 2) & 0x1249249249249249; + + return x | (y << 1) | (z << 2); + } +} diff --git a/thirdparty/embree/common/math/obbox.h b/thirdparty/embree/common/math/obbox.h new file mode 100644 index 0000000000..2fe8bbf071 --- /dev/null +++ b/thirdparty/embree/common/math/obbox.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "linearspace3.h" + +namespace embree +{ + /*! Oriented bounding box */ + template<typename T> + struct OBBox + { + public: + + __forceinline OBBox () {} + + __forceinline OBBox (EmptyTy) + : space(one), bounds(empty) {} + + __forceinline OBBox (const BBox<T>& bounds) + : space(one), bounds(bounds) {} + + __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) + : space(space), bounds(bounds) {} + + friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { + return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; + } + + public: + LinearSpace3<T> space; //!< orthonormal transformation + BBox<T> bounds; //!< bounds in transformed space + }; + + typedef OBBox<Vec3f> OBBox3f; + typedef OBBox<Vec3fa> OBBox3fa; +} diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h new file mode 100644 index 0000000000..080800efcd --- /dev/null +++ b/thirdparty/embree/common/math/quaternion.h @@ -0,0 +1,254 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "vec4.h" + +#include "transcendental.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////// + // Quaternion Struct + //////////////////////////////////////////////////////////////// + + template<typename T> + struct QuaternionT + { + typedef Vec3<T> Vector; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT () { } + __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } + __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } + + __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} + __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} + __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} + __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} + __forceinline QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {} + + __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ); + __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} + __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} + + /*! return quaternion for rotation around arbitrary axis */ + static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) { + return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); + } + + /*! returns the rotation axis of the quaternion as a vector */ + __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); } + + public: + T r, i, j, k; + }; + + template<typename T> __forceinline QuaternionT<T> operator *( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); } + template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); } + + //////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////// + + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); } + template<typename T> __forceinline QuaternionT<T> conj ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); } + template<typename T> __forceinline T abs ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template<typename T> __forceinline QuaternionT<T> rcp ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + + // evaluates a*q-r + template<typename T> __forceinline QuaternionT<T> + msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(msub(a, q.r, p.r), + msub(a, q.i, p.i), + msub(a, q.j, p.j), + msub(a, q.k, p.k)); + } + // evaluates a*q-r + template<typename T> __forceinline QuaternionT<T> + madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(madd(a, q.r, p.r), + madd(a, q.i, p.i), + madd(a, q.j, p.j), + madd(a, q.k, p.k)); + } + + //////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////// + + template<typename T> __forceinline QuaternionT<T> operator +( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r, b.i, b.j, b.k); } + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); } + template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const T & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); } + template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } + + template<typename T> __forceinline Vec3<T> operator *( const QuaternionT<T>& a, const Vec3<T> & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) { + return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, + a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, + a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, + a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); + } + template<typename T> __forceinline QuaternionT<T> operator /( const T & a, const QuaternionT<T>& b ) { return a*rcp(b); } + template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T & b ) { return a*rcp(b); } + template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); } + + template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T & b ) { return a = a+b; } + template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; } + template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T & b ) { return a = a-b; } + template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; } + template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T & b ) { return a = a*b; } + template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; } + template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T & b ) { return a = a*rcp(b); } + template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); } + + template<typename T, typename M> __forceinline QuaternionT<T> + select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p) + { + return QuaternionT<T>(select(m, q.r, p.r), + select(m, q.i, p.i), + select(m, q.j, p.j), + select(m, q.k, p.k)); + } + + + template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>& b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); } + + template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } + template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Orientation Functions + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz ) + { + if ( vx.x + vy.y + vz.z >= T(zero) ) + { + const T t = T(one) + (vx.x + vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = t*s; + i = (vy.z - vz.y)*s; + j = (vz.x - vx.z)*s; + k = (vx.y - vy.x)*s; + } + else if ( vx.x >= max(vy.y, vz.z) ) + { + const T t = (T(one) + vx.x) - (vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = (vy.z - vz.y)*s; + i = t*s; + j = (vx.y + vy.x)*s; + k = (vz.x + vx.z)*s; + } + else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) + { + const T t = (T(one) + vy.y) - (vz.z + vx.x); + const T s = rsqrt(t)*T(0.5f); + r = (vz.x - vx.z)*s; + i = (vx.y + vy.x)*s; + j = t*s; + k = (vy.z + vz.y)*s; + } + else //if ( vz.z >= max(vy.y, vx.x) ) + { + const T t = (T(one) + vz.z) - (vx.x + vy.y); + const T s = rsqrt(t)*T(0.5f); + r = (vx.y - vy.x)*s; + i = (vz.x + vx.z)*s; + j = (vy.z + vz.y)*s; + k = t*s; + } + } + + template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll ) + { + const T cya = cos(yaw *T(0.5f)); + const T cpi = cos(pitch*T(0.5f)); + const T cro = cos(roll *T(0.5f)); + const T sya = sin(yaw *T(0.5f)); + const T spi = sin(pitch*T(0.5f)); + const T sro = sin(roll *T(0.5f)); + r = cro*cya*cpi + sro*sya*spi; + i = cro*cya*spi + sro*sya*cpi; + j = cro*sya*cpi - sro*cya*spi; + k = sro*cya*cpi - cro*sya*spi; + } + + ////////////////////////////////////////////////////////////////////////////// + /// Output Operators + ////////////////////////////////////////////////////////////////////////////// + + template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) { + return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; + } + + /*! default template instantiations */ + typedef QuaternionT<float> Quaternion3f; + typedef QuaternionT<double> Quaternion3d; + + template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>; + typedef QuaternionT<vfloat<4>> Quaternion3vf4; + typedef QuaternionT<vfloat<8>> Quaternion3vf8; + typedef QuaternionT<vfloat<16>> Quaternion3vf16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template<typename T> + __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0, + const QuaternionT<T>& q1, + const T& factor) + { + QuaternionT<T> q; + q.r = lerp(q0.r, q1.r, factor); + q.i = lerp(q0.i, q1.i, factor); + q.j = lerp(q0.j, q1.j, factor); + q.k = lerp(q0.k, q1.k, factor); + return q; + } + + template<typename T> + __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0, + const QuaternionT<T>& q1_, + const T& t) + { + T cosTheta = dot(q0, q1_); + QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); + cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); + if (unlikely(all(cosTheta > 0.9995f))) { + return normalize(lerp(q0, q1, t)); + } + const T phi = t * fastapprox::acos(cosTheta); + T sinPhi, cosPhi; + fastapprox::sincos(phi, sinPhi, cosPhi); + QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); + return msub(cosPhi, q0, qperp); + } +} diff --git a/thirdparty/embree/common/math/range.h b/thirdparty/embree/common/math/range.h new file mode 100644 index 0000000000..909fadb995 --- /dev/null +++ b/thirdparty/embree/common/math/range.h @@ -0,0 +1,137 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../math/math.h" + +namespace embree +{ + template<typename Ty> + struct range + { + __forceinline range() {} + + __forceinline range(const Ty& begin) + : _begin(begin), _end(begin+1) {} + + __forceinline range(const Ty& begin, const Ty& end) + : _begin(begin), _end(end) {} + + __forceinline range(const range& other) + : _begin(other._begin), _end(other._end) {} + + template<typename T1> + __forceinline range(const range<T1>& other) + : _begin(Ty(other._begin)), _end(Ty(other._end)) {} + + template<typename T1> + __forceinline range& operator =(const range<T1>& other) { + _begin = other._begin; + _end = other._end; + return *this; + } + + __forceinline Ty begin() const { + return _begin; + } + + __forceinline Ty end() const { + return _end; + } + + __forceinline range intersect(const range& r) const { + return range (max(_begin,r._begin),min(_end,r._end)); + } + + __forceinline Ty size() const { + return _end - _begin; + } + + __forceinline bool empty() const { + return _end <= _begin; + } + + __forceinline Ty center() const { + return (_begin + _end)/2; + } + + __forceinline std::pair<range,range> split() const + { + const Ty _center = center(); + return std::make_pair(range(_begin,_center),range(_center,_end)); + } + + __forceinline void split(range& left_o, range& right_o) const + { + const Ty _center = center(); + left_o = range(_begin,_center); + right_o = range(_center,_end); + } + + __forceinline friend bool operator< (const range& r0, const range& r1) { + return r0.size() < r1.size(); + } + + friend embree_ostream operator<<(embree_ostream cout, const range& r) { + return cout << "range [" << r.begin() << ", " << r.end() << "]"; + } + + Ty _begin, _end; + }; + + template<typename Ty> + range<Ty> make_range(const Ty& begin, const Ty& end) { + return range<Ty>(begin,end); + } + + template<typename Ty> + struct extended_range : public range<Ty> + { + __forceinline extended_range () {} + + __forceinline extended_range (const Ty& begin) + : range<Ty>(begin), _ext_end(begin+1) {} + + __forceinline extended_range (const Ty& begin, const Ty& end) + : range<Ty>(begin,end), _ext_end(end) {} + + __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) + : range<Ty>(begin,end), _ext_end(ext_end) {} + + __forceinline Ty ext_end() const { + return _ext_end; + } + + __forceinline Ty ext_size() const { + return _ext_end - range<Ty>::_begin; + } + + __forceinline Ty ext_range_size() const { + return _ext_end - range<Ty>::_end; + } + + __forceinline bool has_ext_range() const { + assert(_ext_end >= range<Ty>::_end); + return (_ext_end - range<Ty>::_end) > 0; + } + + __forceinline void set_ext_range(const size_t ext_end){ + assert(ext_end >= range<Ty>::_end); + _ext_end = ext_end; + } + + __forceinline void move_right(const size_t plus){ + range<Ty>::_begin += plus; + range<Ty>::_end += plus; + _ext_end += plus; + } + + friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { + return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; + } + + Ty _ext_end; + }; +} diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h new file mode 100644 index 0000000000..fd16c26e81 --- /dev/null +++ b/thirdparty/embree/common/math/transcendental.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Transcendental functions from "ispc": https://github.com/ispc/ispc/ +// Most of the transcendental implementations in ispc code come from +// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ + +#include "../simd/simd.h" + +namespace embree +{ + +namespace fastapprox +{ + +template <typename T> +__forceinline T sin(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto flipSign = (kMod4 > 1); + + // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, + // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); + static const float sinC2 = -0.16666667163372039794921875; + static const float sinC4 = +8.333347737789154052734375e-3; + static const float sinC6 = -1.9842604524455964565277099609375e-4; + static const float sinC8 = +2.760012648650445044040679931640625e-6; + static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + static const float cosC2 = -0.5; + static const float cosC4 = +4.166664183139801025390625e-2; + static const float cosC6 = -1.388833043165504932403564453125e-3; + static const float cosC8 = +2.47562347794882953166961669921875e-5; + static const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(sinUseCos, 1., x); + auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); + auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); + auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); + auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); + auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template <typename T> +__forceinline T cos(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + + auto kMod4 = k & 3; + auto cosUseCos = (kMod4 == 0 | kMod4 == 2); + auto flipSign = (kMod4 == 1 | kMod4 == 2); + + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(cosUseCos, 1., x); + auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); + auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); + auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); + auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); + auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template <typename T> +__forceinline void sincos(const T &v, T &sinResult, T &cosResult) +{ + const float piOverTwoVec = 1.57079637050628662109375; + const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); + auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); + auto sinFlipSign = (kMod4 > 1); + auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); + + const float oneVec = +1.; + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto x2 = x * x; + + auto sinFormula = x2 * sinC10 + sinC8; + auto cosFormula = x2 * cosC10 + cosC8; + sinFormula = x2 * sinFormula + sinC6; + cosFormula = x2 * cosFormula + cosC6; + + sinFormula = x2 * sinFormula + sinC4; + cosFormula = x2 * cosFormula + cosC4; + + sinFormula = x2 * sinFormula + sinC2; + cosFormula = x2 * cosFormula + cosC2; + + sinFormula = x2 * sinFormula + oneVec; + cosFormula = x2 * cosFormula + oneVec; + + sinFormula *= x; + + sinResult = select(sinUseCos, cosFormula, sinFormula); + cosResult = select(cosUseCos, cosFormula, sinFormula); + + sinResult = select(sinFlipSign, -sinResult, sinResult); + cosResult = select(cosFlipSign, -cosResult, cosResult); +} + +template <typename T> +__forceinline T tan(const T &v) +{ + const float piOverFourVec = 0.785398185253143310546875; + const float fourOverPiVec = 1.27323949337005615234375; + + auto xLt0 = v < 0.; + auto y = select(xLt0, -v, v); + auto scaled = y * fourOverPiVec; + + auto kReal = floor(scaled); + auto k = toInt(kReal); + + auto x = y - kReal * piOverFourVec; + + // If k & 1, x -= Pi/4 + auto needOffset = (k & 1) != 0; + x = select(needOffset, x - piOverFourVec, x); + + // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... + auto kMod4 = k & 3; + auto useCotan = (kMod4 == 1) | (kMod4 == 2); + + const float oneVec = 1.0; + + const float tanC2 = +0.33333075046539306640625; + const float tanC4 = +0.13339905440807342529296875; + const float tanC6 = +5.3348250687122344970703125e-2; + const float tanC8 = +2.46033705770969390869140625e-2; + const float tanC10 = +2.892402000725269317626953125e-3; + const float tanC12 = +9.500005282461643218994140625e-3; + + const float cotC2 = -0.3333333432674407958984375; + const float cotC4 = -2.222204394638538360595703125e-2; + const float cotC6 = -2.11752182804048061370849609375e-3; + const float cotC8 = -2.0846328698098659515380859375e-4; + const float cotC10 = -2.548247357481159269809722900390625e-5; + const float cotC12 = -3.5257363606433500535786151885986328125e-7; + + auto x2 = x * x; + T z; + if (any(useCotan)) + { + auto cotVal = x2 * cotC12 + cotC10; + cotVal = x2 * cotVal + cotC8; + cotVal = x2 * cotVal + cotC6; + cotVal = x2 * cotVal + cotC4; + cotVal = x2 * cotVal + cotC2; + cotVal = x2 * cotVal + oneVec; + // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. + cotVal /= -x; + z = cotVal; + } + auto useTan = !useCotan; + if (any(useTan)) + { + auto tanVal = x2 * tanC12 + tanC10; + tanVal = x2 * tanVal + tanC8; + tanVal = x2 * tanVal + tanC6; + tanVal = x2 * tanVal + tanC4; + tanVal = x2 * tanVal + tanC2; + tanVal = x2 * tanVal + oneVec; + // Equation was for tan(x)/x + tanVal *= x; + z = select(useTan, tanVal, z); + } + return select(xLt0, -z, z); +} + +template <typename T> +__forceinline T asin(const T &x0) +{ + auto isneg = (x0 < 0.f); + auto x = abs(x0); + auto isnan = (x > 1.f); + + // sollya + // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], + // [1e-20;.9999999999999999]); + // avg error: 1.1105439e-06, max error 1.3187528e-06 + auto v = 1.57079517841339111328125f + + x * (-0.21450997889041900634765625f + + x * (8.78556668758392333984375e-2f + + x * (-4.489909112453460693359375e-2f + + x * (1.928029954433441162109375e-2f + + x * (-4.3095736764371395111083984375e-3f))))); + + v *= -sqrt(1.f - x); + v = v + 1.57079637050628662109375f; + + v = select(v < 0.f, T(0.f), v); + v = select(isneg, -v, v); + v = select(isnan, T(cast_i2f(0x7fc00000)), v); + + return v; +} + +template <typename T> +__forceinline T acos(const T &v) +{ + return 1.57079637050628662109375f - asin(v); +} + +template <typename T> +__forceinline T atan(const T &v) +{ + const float piOverTwoVec = 1.57079637050628662109375; + // atan(-x) = -atan(x) (so flip from negative to positive first) + // If x > 1 -> atan(x) = Pi/2 - atan(1/x) + auto xNeg = v < 0.f; + auto xFlipped = select(xNeg, -v, v); + + auto xGt1 = xFlipped > 1.; + auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); + + // These coefficients approximate atan(x)/x + const float atanC0 = +0.99999988079071044921875; + const float atanC2 = -0.3333191573619842529296875; + const float atanC4 = +0.199689209461212158203125; + const float atanC6 = -0.14015688002109527587890625; + const float atanC8 = +9.905083477497100830078125e-2; + const float atanC10 = -5.93664981424808502197265625e-2; + const float atanC12 = +2.417283318936824798583984375e-2; + const float atanC14 = -4.6721356920897960662841796875e-3; + + auto x2 = x * x; + auto result = x2 * atanC14 + atanC12; + result = x2 * result + atanC10; + result = x2 * result + atanC8; + result = x2 * result + atanC6; + result = x2 * result + atanC4; + result = x2 * result + atanC2; + result = x2 * result + atanC0; + result *= x; + + result = select(xGt1, piOverTwoVec - result, result); + result = select(xNeg, -result, result); + return result; +} + +template <typename T> +__forceinline T atan2(const T &y, const T &x) +{ + const float piVec = 3.1415926536; + // atan2(y, x) = + // + // atan2(y > 0, x = +-0) -> Pi/2 + // atan2(y < 0, x = +-0) -> -Pi/2 + // atan2(y = +-0, x < +0) -> +-Pi + // atan2(y = +-0, x >= +0) -> +-0 + // + // atan2(y >= 0, x < 0) -> Pi + atan(y/x) + // atan2(y < 0, x < 0) -> -Pi + atan(y/x) + // atan2(y, x > 0) -> atan(y/x) + // + // and then a bunch of code for dealing with infinities. + auto yOverX = y * rcpSafe(x); + auto atanArg = atan(yOverX); + auto xLt0 = x < 0.f; + auto yLt0 = y < 0.f; + auto offset = select(xLt0, + select(yLt0, T(-piVec), T(piVec)), 0.f); + return offset + atanArg; +} + +template <typename T> +__forceinline T exp(const T &v) +{ + const float ln2Part1 = 0.6931457519; + const float ln2Part2 = 1.4286067653e-6; + const float oneOverLn2 = 1.44269502162933349609375; + + auto scaled = v * oneOverLn2; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * ln2Part1; + x -= kReal * ln2Part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.; + const float c2 = 0.4999999105930328369140625; + const float c3 = 0.166668415069580078125; + const float c4 = 4.16539050638675689697265625e-2; + const float c5 = 8.378830738365650177001953125e-3; + const float c6 = 1.304379315115511417388916015625e-3; + const float c7 = 2.7555381529964506626129150390625e-4; + + auto result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + auto biasedN = k + fpbias; + auto overflow = kReal > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. + auto underflow = kReal <= -fpbias; + const int infBits = 0x7f800000; + biasedN <<= 23; + // Reinterpret this thing as float + auto twoToTheN = asFloat(biasedN); + // Handle both doubles and floats (hopefully eliding the copy for float) + auto elemtype2n = twoToTheN; + result *= elemtype2n; + result = select(overflow, cast_i2f(infBits), result); + result = select(underflow, 0., result); + return result; +} + +// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n +// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). +template <typename T, typename R> +__forceinline void __rangeReduceLog(const T &input, + T &reduced, + R &exponent) +{ + auto intVersion = asInt(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponentMask(0x7F800000) + static const int nonexponentMask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is + // -1 + 127 after biasing or 126 + static const int exponentNeg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). + auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] + + auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + exponent = offsetExponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + auto blended = (intVersion & nonexponentMask) | (exponentNeg1); + reduced = asFloat(blended); +} + +template <typename T> struct ExponentType { }; +template <int N> struct ExponentType<vfloat_impl<N>> { typedef vint<N> Ty; }; +template <> struct ExponentType<float> { typedef int Ty; }; + +template <typename T> +__forceinline T log(const T &v) +{ + T reduced; + typename ExponentType<T>::Ty exponent; + + const int nanBits = 0x7fc00000; + const int negInfBits = 0xFF800000; + const float nan = cast_i2f(nanBits); + const float negInf = cast_i2f(negInfBits); + auto useNan = v < 0.; + auto useInf = v == 0.; + auto exceptional = useNan | useInf; + const float one = 1.0; + + auto patched = select(exceptional, one, v); + __rangeReduceLog(patched, reduced, exponent); + + const float ln2 = 0.693147182464599609375; + + auto x1 = one - reduced; + const float c1 = +0.50000095367431640625; + const float c2 = +0.33326041698455810546875; + const float c3 = +0.2519190013408660888671875; + const float c4 = +0.17541764676570892333984375; + const float c5 = +0.3424419462680816650390625; + const float c6 = -0.599632322788238525390625; + const float c7 = +1.98442304134368896484375; + const float c8 = -2.4899270534515380859375; + const float c9 = +1.7491014003753662109375; + + auto result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += toFloat(exponent) * ln2; + + return select(exceptional, + select(useNan, T(nan), T(negInf)), + result); +} + +template <typename T> +__forceinline T pow(const T &x, const T &y) +{ + auto x1 = abs(x); + auto z = exp(y * log(x1)); + + // Handle special cases + const float twoOver23 = 8388608.0f; + auto yInt = y == round(y); + auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit + + // x == 0 + z = select(x == 0.0f, + select(y < 0.0f, T(inf) | signmsk(x), + select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); + + // x < 0 + auto xNegative = x < 0.0f; + if (any(xNegative)) + { + auto z1 = z | asFloat(yOddInt); + z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN()); + z = select(xNegative, z1, z); + } + + auto xFinite = isfinite(x); + auto yFinite = isfinite(y); + if (all(xFinite & yFinite)) + return z; + + // x finite and y infinite + z = select(andn(xFinite, yFinite), + select(x1 == 1.0f, 1.0f, + select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); + + // x infinite + z = select(xFinite, z, + select(y == 0.0f, 1.0f, + select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); + + return z; +} + +template <typename T> +__forceinline T pow(const T &x, float y) +{ + return pow(x, T(y)); +} + +} // namespace fastapprox + +} // namespace embree diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h new file mode 100644 index 0000000000..d62aef51f3 --- /dev/null +++ b/thirdparty/embree/common/math/vec2.h @@ -0,0 +1,235 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec2fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 2D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec2 + { + enum { N = 2 }; + union { + struct { T x, y; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ) {} + __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} + __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} + + __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2( const Vec2fa& other ); + + template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {} + template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} + __forceinline Vec2( OneTy ) : x(one), y(one) {} + __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} + __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} + +#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } + __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); } + template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); } + template<typename T> __forceinline Vec2<T> abs ( const Vec2<T>& a ) { return Vec2<T>(abs (a.x), abs (a.y)); } + template<typename T> __forceinline Vec2<T> rcp ( const Vec2<T>& a ) { return Vec2<T>(rcp (a.x), rcp (a.y)); } + template<typename T> __forceinline Vec2<T> rsqrt ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); } + template<typename T> __forceinline Vec2<T> sqrt ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); } + template<typename T> __forceinline Vec2<T> frac ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); } + template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x + b , a.y + b ); } + template<typename T> __forceinline Vec2<T> operator +( const T& a, const Vec2<T>& b ) { return Vec2<T>(a + b.x, a + b.y); } + template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); } + template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x - b , a.y - b ); } + template<typename T> __forceinline Vec2<T> operator -( const T& a, const Vec2<T>& b ) { return Vec2<T>(a - b.x, a - b.y); } + template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); } + template<typename T> __forceinline Vec2<T> operator *( const T& a, const Vec2<T>& b ) { return Vec2<T>(a * b.x, a * b.y); } + template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x * b , a.y * b ); } + template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); } + template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const T& b ) { return Vec2<T>(a.x / b , a.y / b ); } + template<typename T> __forceinline Vec2<T> operator /( const T& a, const Vec2<T>& b ) { return Vec2<T>(a / b.x, a / b.y); } + + template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); } + template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> madd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> msub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); } + + template<typename T> __forceinline Vec2<T> madd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> msub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } + template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; } + template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; } + template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const T& b ) { a.x *= b ; a.y *= b ; return a; } + template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; } + template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; } + template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); } + template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; } + template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; } + template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) { + return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); } + template<typename T> __forceinline Vec2<T> cross ( const Vec2<T>& a ) { return Vec2<T>(-a.y,a.x); } + template<typename T> __forceinline T length ( const Vec2<T>& a ) { return sqrt(dot(a,a)); } + template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a ) { return a*rsqrt(dot(a,a)); } + template<typename T> __forceinline T distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); } + template<typename T> __forceinline T det ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; } + + template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) { + const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); + } + + template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) { + return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template<typename T> + __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) { + return madd(Vec2<T>(T(1.0f)-t),v0,t*v1); + } + + template<typename T> __forceinline int maxDim ( const Vec2<T>& a ) + { + const Vec2<T> b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec2<bool > Vec2b; + typedef Vec2<int > Vec2i; + typedef Vec2<float> Vec2f; +} + +#include "vec2fa.h" + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} + +#if defined(__SSE__) + template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX__) + template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif +} diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h new file mode 100644 index 0000000000..a51fb68fd0 --- /dev/null +++ b/thirdparty/embree/common/math/vec2fa.h @@ -0,0 +1,301 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec2fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec2fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 2 }; + union { + __m128 m128; + struct { float x,y,az,aw; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ) {} + __forceinline Vec2fa( const __m128 a ) : m128(a) {} + + __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; } + __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } + __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} + + __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec2fa load( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline Vec2fa loadu( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { + _mm_storeu_ps((float*)ptr,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } + __forceinline Vec2fa operator -( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec2fa abs ( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec2fa sign ( const Vec2fa& a ) { + return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); + } + + __forceinline Vec2fa rcp ( const Vec2fa& a ) + { +#if defined(__AVX512VL__) + const Vec2fa r = _mm_rcp14_ps(a.m128); +#else + const Vec2fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } + + __forceinline Vec2fa rsqrt( const Vec2fa& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec2fa zero_fix(const Vec2fa& a) { + return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec2fa rcp_safe(const Vec2fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec2fa log ( const Vec2fa& a ) { + return Vec2fa(logf(a.x),logf(a.y)); + } + + __forceinline Vec2fa exp ( const Vec2fa& a ) { + return Vec2fa(expf(a.x),expf(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } + __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } + __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { + return Vec2fa(powf(a.x,b),powf(a.y,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } +#endif + + __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } + __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } + __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } + __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } + __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } + __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } + __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } + __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } + __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); + } +#else + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec2fa cross ( const Vec2fa& a ) { + return Vec2fa(-a.y,a.x); + } + + __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec2fa& a ) + { + const Vec2fa b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } + __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } + __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } +#elif defined (__SSE4_1__) + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +#else + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + typedef Vec2fa Vec2fa_t; +} diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h new file mode 100644 index 0000000000..ce94eff327 --- /dev/null +++ b/thirdparty/embree/common/math/vec3.h @@ -0,0 +1,337 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec3fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 3D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec3 + { + enum { N = 3 }; + + union { + struct { + T x, y, z; + }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ) {} + __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} + __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {} + + __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } + __forceinline Vec3( const Vec3fa& other ); + + template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} + template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; } + + __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {} + __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} + __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} + __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } + __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); } + template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); } + template<typename T> __forceinline Vec3<T> abs ( const Vec3<T>& a ) { return Vec3<T>(abs (a.x), abs (a.y), abs (a.z)); } + template<typename T> __forceinline Vec3<T> rcp ( const Vec3<T>& a ) { return Vec3<T>(rcp (a.x), rcp (a.y), rcp (a.z)); } + template<typename T> __forceinline Vec3<T> rsqrt ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } + template<typename T> __forceinline Vec3<T> sqrt ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } + + template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a ) + { + return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x), + select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y), + select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z)); + } + template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); } + template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); } + template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); } + template<typename T> __forceinline Vec3<T> operator *( const T& a, const Vec3<T>& b ) { return Vec3<T>(a * b.x, a * b.y, a * b.z); } + template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x * b , a.y * b , a.z * b ); } + template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const T& b ) { return Vec3<T>(a.x / b , a.y / b , a.z / b ); } + template<typename T> __forceinline Vec3<T> operator /( const T& a, const Vec3<T>& b ) { return Vec3<T>(a / b.x, a / b.y, a / b.z); } + template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); } + + template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } + template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } + + template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); } + template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> madd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> msub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} + template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } + + template<typename T> __forceinline Vec3<T> madd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> msub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } + template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} + template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } + template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } + template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } + template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; } + template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; } + template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; } + template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); } + template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } + template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } + template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) { + return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); + } + + template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) { + return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template<typename T> + __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) { + return madd(Vec3<T>(T(1.0f)-t),v0,t*v1); + } + + template<typename T> __forceinline int maxDim ( const Vec3<T>& a ) + { + const Vec3<T> b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); } + template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); } + template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); } + template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); } + template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); } + template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); } + template<typename T> __forceinline T dot ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } + template<typename T> __forceinline T length ( const Vec3<T>& a ) { return sqrt(sqr(a)); } + template<typename T> __forceinline T rcp_length( const Vec3<T>& a ) { return rsqrt(sqr(a)); } + template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); } + template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); } + template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); } + + template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c ) + { + const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; + const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; + const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); + const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); + const auto sx = abs(ab_x) < abs(bc_x); + const auto sy = abs(ab_y) < abs(bc_y); + const auto sz = abs(ab_z) < abs(bc_z); + return Vec3<T>(select(sx,cross_ab.x,cross_bc.x), + select(sy,cross_ab.y,cross_bc.y), + select(sz,cross_ab.z,cross_bc.z)); + } + + template<typename T> __forceinline T sum ( const Vec3<T>& a ) { return a.x+a.y+a.z; } + + template<typename T> __forceinline T halfArea ( const Vec3<T>& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + template<typename T> __forceinline T area ( const Vec3<T>& d ) { return 2.0f*halfArea(d); } + + template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) { + const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); + } + + template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1) + { + const Vec3<T> N = cross(P-Q0,Q1-Q0); + const Vec3<T> D = Q1-Q0; + return dot(N,N)*rcp(dot(D,D)); + } + + template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0) + { + const Vec3<T> N = cross(PmQ0,Q1mQ0); + const Vec3<T> D = Q1mQ0; + return dot(N,N)*rcp(dot(D,D)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3<bool > Vec3b; + typedef Vec3<int > Vec3i; + typedef Vec3<float> Vec3f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<typename Out, typename In> + __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) { + return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); + } + + template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } + +#if defined(__AVX__) + template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } +#elif defined(__SSE__) + template<> + __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); + } +#endif + +#if defined(__SSE__) + template<> + __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { + return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template<int i0, int i1, int i2, int i3> + __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) { + return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); + } +#endif + +#if defined(__AVX__) + template<> + __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } + + template<> + __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { + return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + template<> + __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) { + return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template<int i0, int i1, int i2, int i3> + __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) { + return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z)); + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} +#endif +} diff --git a/thirdparty/embree/common/math/vec3ba.h b/thirdparty/embree/common/math/vec3ba.h new file mode 100644 index 0000000000..a021b522dc --- /dev/null +++ b/thirdparty/embree/common/math/vec3ba.h @@ -0,0 +1,120 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ba Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ba + { + ALIGNED_STRUCT_(16); + + union { + __m128 m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( ) {} + __forceinline Vec3ba( const __m128 input ) : m128(input) {} + __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} + __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ba( bool a ) + : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline Vec3ba( bool a, bool b, bool c) + : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } + __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } + __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } + __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } + __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; + } + __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; + } + __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } + __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } + + __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } + __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } + __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } + + __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { + return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")"; + } +} diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h new file mode 100644 index 0000000000..586039741d --- /dev/null +++ b/thirdparty/embree/common/math/vec3fa.h @@ -0,0 +1,727 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ) {} + __forceinline Vec3fa( const __m128 a ) : m128(a) {} + + __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } + __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fa load( const void* const a ) { + return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fa loadu( const void* const a ) { + return Vec3fa(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } + __forceinline Vec3fa operator -( const Vec3fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fa abs ( const Vec3fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fa sign ( const Vec3fa& a ) { + return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); + } + + __forceinline Vec3fa rcp ( const Vec3fa& a ) + { +#if defined(__AVX512VL__) + const Vec3fa r = _mm_rcp14_ps(a.m128); +#else + const Vec3fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fa rsqrt( const Vec3fa& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fa zero_fix(const Vec3fa& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fa rcp_safe(const Vec3fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fa log ( const Vec3fa& a ) { + return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fa exp ( const Vec3fa& a ) { + return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } + __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } + __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { + return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } +#endif + + __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } + __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } + __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } + __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } + __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fa& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fa& v ) { + return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fa& a ) { + return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fa& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fa& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fa& a ) + { + const Vec3fa b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE4_1__) + __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3fa Vec3fa_t; + + + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fx Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fx + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; union { int a; unsigned u; float w; }; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ) {} + __forceinline Vec3fx( const __m128 a ) : m128(a) {} + + __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} + __forceinline operator Vec3fa () const { return Vec3fa(m128); } + + __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } + + __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } + __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__SSE4_1__) + m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); +#else + const vint4 mask(-1,-1,-1,0); + m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); +#endif + } + //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! + //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! + __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} + + //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fx load( const void* const a ) { + return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fx loadu( const void* const a ) { + return Vec3fx(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } + __forceinline Vec3fx operator -( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fx abs ( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fx sign ( const Vec3fx& a ) { + return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); + } + + __forceinline Vec3fx rcp ( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + const Vec3fx r = _mm_rcp14_ps(a.m128); +#else + const Vec3fx r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fx rsqrt( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fx zero_fix(const Vec3fx& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fx rcp_safe(const Vec3fx& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fx log ( const Vec3fx& a ) { + return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fx exp ( const Vec3fx& a ) { + return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } + __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } + __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) + __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) + __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { + return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } +#endif + + __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } + __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } + __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } + __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } + __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fx& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fx& v ) { + return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fx& a ) { + return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fx& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fx& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fx& a ) + { + const Vec3fx b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); } + __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); } +#elif defined (__SSE4_1__) + __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + + typedef Vec3fx Vec3ff; +} diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h new file mode 100644 index 0000000000..694804c40d --- /dev/null +++ b/thirdparty/embree/common/math/vec3ia.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ia Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ia + { + ALIGNED_STRUCT_(16); + + union { + __m128i m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ) {} + __forceinline Vec3ia( const __m128i a ) : m128(a) {} + __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} + __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} + __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} + __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} + + __forceinline operator const __m128i&() const { return m128; } + __forceinline operator __m128i&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} + __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} + __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} + __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } + __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } +#if defined(__SSSE3__) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } + __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } + + __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } + __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } + +#if defined(__SSE4_1__) + __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } + __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } +#endif + + __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } + __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } + __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } + + __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } + __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } + __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } + + __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } + __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } + __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } + + __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } + __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } + + __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } + __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } + __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } + __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } + + __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } + __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } + +#if defined(__SSE4_1__) + __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } + __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } +#endif + + __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } + __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } + + __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } + __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } + + __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } + __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } + __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } + __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } + __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + +#if defined(__SSE4_1__) + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } +#else + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } +} diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h new file mode 100644 index 0000000000..0ed107928a --- /dev/null +++ b/thirdparty/embree/common/math/vec4.h @@ -0,0 +1,243 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" +#include "vec3.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Generic 4D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> struct Vec4 + { + enum { N = 4 }; + union { + struct { T x, y, z, w; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ) {} + __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} + __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} + __forceinline Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + + __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } + __forceinline Vec4( const Vec3fx& other ); + + template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} + template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} + __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} + __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} + __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Swizzles + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); } + template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); } + template<typename T> __forceinline Vec4<T> abs ( const Vec4<T>& a ) { return Vec4<T>(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } + template<typename T> __forceinline Vec4<T> rcp ( const Vec4<T>& a ) { return Vec4<T>(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } + template<typename T> __forceinline Vec4<T> rsqrt ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } + template<typename T> __forceinline Vec4<T> sqrt ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + template<typename T> __forceinline Vec4<T> operator *( const T& a, const Vec4<T>& b ) { return Vec4<T>(a * b.x, a * b.y, a * b.z, a * b.w); } + template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x * b , a.y * b , a.z * b , a.w * b ); } + template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } + template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const T& b ) { return Vec4<T>(a.x / b , a.y / b , a.z / b , a.w / b ); } + template<typename T> __forceinline Vec4<T> operator /( const T& a, const Vec4<T>& b ) { return Vec4<T>(a / b.x, a / b.y, a / b.z, a / b.w); } + + template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } + template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> madd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> msub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } + + template<typename T> __forceinline Vec4<T> madd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> msub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } + template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } + template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } + template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } + template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; } + template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; } + template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); } + template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } + template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; } + template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + if (a.w != b.w) return a.w < b.w; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) { + return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } + + template<typename T> __forceinline T length ( const Vec4<T>& a ) { return sqrt(dot(a,a)); } + template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) { return a*rsqrt(dot(a,a)); } + template<typename T> __forceinline T distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); + } + + template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) { + return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template<typename T> + __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) { + return madd(Vec4<T>(T(1.0f)-t),v0,t*v1); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec4<bool > Vec4b; + typedef Vec4<unsigned char> Vec4uc; + typedef Vec4<int > Vec4i; + typedef Vec4<float > Vec4f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined __SSE__ +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined __AVX512F__ +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } + +#if defined(__AVX__) + template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#elif defined(__SSE__) + template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); + } +#endif + +#if defined(__AVX__) + template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} +#endif +} diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h new file mode 100644 index 0000000000..1c3875fb27 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/emulation.h @@ -0,0 +1,50 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* Make precision match SSE, at the cost of some performance */ +#if !defined(__aarch64__) +# define SSE2NEON_PRECISE_DIV 1 +# define SSE2NEON_PRECISE_SQRT 1 +#endif + +#include "sse2neon.h" + +__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { + __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c))); + return _mm_fmadd_ps(a, b, neg_c); +} + +__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_sub_ps(c, _mm_mul_ps(a, b)); +#endif +} + +__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { + return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c)))); +} + + +/* Dummy defines for floating point control */ +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_DIV_ZERO 0x200 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_MASK_DENORM 0x100 +#define _MM_SET_EXCEPTION_MASK(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) + +__forceinline int _mm_getcsr() +{ + return 0; +} + +__forceinline void _mm_mfence() +{ + __sync_synchronize(); +} diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h new file mode 100644 index 0000000000..7eb25cf2c5 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/sse2neon.h @@ -0,0 +1,6996 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. +// +// Contributors to this work are: +// John W. Ratcliff <jratcliffscarab@gmail.com> +// Brandon Rowlett <browlett@nvidia.com> +// Ken Fast <kfast@gdeb.com> +// Eric van Beurden <evanbeurden@nvidia.com> +// Alexander Potylitsin <apotylitsin@nvidia.com> +// Hasindu Gamaarachchi <hasindu2008@gmail.com> +// Jim Huang <jserv@biilabs.io> +// Mark Cheng <marktwtn@biilabs.io> +// Malcolm James MacLeod <malcolm@gulden.com> +// Devin Hussey (easyaspi314) <husseydevin@gmail.com> +// Sebastian Pop <spop@amazon.com> +// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> +// Danila Kutenin <danilak@google.com> +// François Turban (JishinMaster) <francois.turban@gmail.com> +// Pei-Hsuan Hung <afcidk@gmail.com> +// Yang-Hao Yuan <yanghau@biilabs.io> +// Syoyo Fujita <syoyo@lighttransport.com> +// Brecht Van Lommel <brecht@blender.org> + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min_ps and _mm_max_ps */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +#ifndef SSE2NEON_PRECISE_RSQRT +#define SSE2NEON_PRECISE_RSQRT (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#error "Macro name collisions may happen with unsupported compiler." +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#endif +#ifndef likely +#define likely(x) (x) +#endif +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#include <stdint.h> +#include <stdlib.h> + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include <arm_neon.h> + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include <math.h> +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if __GNUC__ <= 9 +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \ + (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ + (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm_<name>_<data_type> + * + * The parts of this format are given as follows: + * 1. <name> describes the operation performed by the intrinsic + * 2. <data_type> identifies the data type of the function's primary arguments + * + * This last part, <data_type>, is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Set/get methods */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. +FORCE_INLINE void _mm_pause() +{ + __asm__ __volatile__("isb\n"); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 +FORCE_INLINE int _mm_cvtss_si64(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64( + vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int64_t) ceil(data); + if (unlikely(diff == 0.5)) { + int64_t f = (int64_t) floor(data); + int64_t c = (int64_t) ceil(data); + return c & 1 ? f : c; + } + return (int64_t) floor(data); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return vgetq_lane_s64( + vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Return vector of type __m128d with all elements set to zero. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... +// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. +// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 4 signed 32-bit integer values to i. +// +// r0 := i +// r1 := i +// r2 := i +// r3 := I +// +// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); +} + +// Sets the 2 signed 64-bit integer values to i. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Sets the 4 signed 32-bit integer values. +// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x((int64_t) i1, (int64_t) i2); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ + return _mm_set_pd(0, a); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[127:96] +// MEM[mem_addr+63:mem_addr+32] := a[95:64] +// MEM[mem_addr+95:mem_addr+64] := a[63:32] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Stores the lower single - precision, floating - point value. +// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// MEM[mem_addr+127:mem_addr+64] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. +// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from 16-byte aligned memory, floating-point +// values. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Loads two double-precision from unaligned memory, floating-point values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Loads an single - precision, floating - point value into the low word and +// clears the upper three words. +// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load 64-bit integer from memory into the first element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Return vector of type __m128 with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +/* Logic/Binary operations */ + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. +// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. +// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if (__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. +// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// +// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +// dst[127:0] := tmp[127:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) >= 32)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + uint8x16_t tmp_low, tmp_high; \ + if (imm >= 16) { \ + const int idx = imm - 16; \ + tmp_low = vreinterpretq_u8_m128i(a); \ + tmp_high = vdupq_n_u8(0); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpretq_u8_m128i(b); \ + tmp_high = vreinterpretq_u8_m128i(a); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// +// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +// dst[63:0] := tmp[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + __extension__({ \ + __m64 ret; \ + if (unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low, tmp_high; \ + if (imm >= 8) { \ + const int idx = imm - 8; \ + tmp_low = vreinterpret_u8_m64(a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpret_u8_m64(b); \ + tmp_high = vreinterpret_u8_m64(a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps((b), (a)); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032((a), (b)); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default((a), (b), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. +FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most signficant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least signficant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. +// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = __builtin_shufflevector( \ + _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032((a)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301((a)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321((a)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101((a)); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122((a)); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332((a)); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat((a), 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat((a), 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat((a), 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat((a), 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default((a), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) +// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflelo_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ + ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +/* Shifts */ + + +// Shift packed 16-bit integers in a right by imm while shifting in sign +// bits, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx +#define _mm_slli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm)) <= 0) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s16( \ + vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. : +// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ + return a; + if (unlikely(imm > 63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely(imm) == 0) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 16)) { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 64)) { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ + } else { \ + ret = _mm_setzero_si128(); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) == 0)) { \ + ret = a; \ + } \ + if (likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ + } \ + ret; \ + }) + +// Shifts the 128 - bit value in a right by imm bytes while shifting in +// zeros.imm must be an immediate. +// +// r := srl(a, imm*8) +// +// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) <= 0)) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \ + } \ + ret; \ + }) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm +// must be an immediate. +// +// r := a << (imm * 8) +// +// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) +#define _mm_slli_si128(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) <= 0)) { \ + ret = a; \ + } \ + if (unlikely((imm) > 15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_s8(vextq_s8( \ + vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ + } \ + ret; \ + }) + +// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// ... +// r7 := a7 << count +// +// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// r2 := a2 << count +// r3 := a3 << count +// +// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while +// shifting in zeros. +// +// r0 := a0 << count +// r1 := a1 << count +// +// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// ... +// r7 := srl(a7, count) +// +// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// r2 := srl(a2, count) +// r3 := srl(a3, count) +// +// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// +// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. +// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four +// single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) + static const int32x4_t shift = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 + : 1; +} + +/* Math operations */ + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. +// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// +// dst[63:0] := a[63:0] - b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit +// integers of a and saturates.. +// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit +// integers of a and saturates. +// +// r0 := UnsignedSaturate(a0 - b0) +// r1 := UnsignedSaturate(a1 - b1) +// ... +// r15 := UnsignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r15 := SignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r7 := SignedSaturate(a7 - b7) +// +// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] - db[0]; + c[1] = da[1] - db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..15 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..7 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// +// for i in 0..3 +// if b[i] < 0 +// r[i] := -a[i] +// else if b[i] == 0 +// r[i] := 0 +// else +// r[i] := a[i] +// fi +// done +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 3 +// i := j*16 +// IF b[i+15:i] < 0 +// dst[i+15:i] := -(a[i+15:i]) +// ELSE IF b[i+15:i] == 0 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// +// FOR j := 0 to 1 +// i := j*32 +// IF b[i+31:i] < 0 +// dst[i+31:i] := -(a[i+31:i]) +// ELSE IF b[i+31:i] == 0 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := a[i+31:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// +// FOR j := 0 to 7 +// i := j*8 +// IF b[i+7:i] < 0 +// dst[i+7:i] := -(a[i+7:i]) +// ELSE IF b[i+7:i] == 0 +// dst[i+7:i] := 0 +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Adds the four single-precision, floating-point values of a and b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1] + db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. +// +// dst[63:0] := a[63:0] + b[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// adds the scalar single-precision floating point values of a and b. +// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of <a>. + return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or +// unsigned 16-bit integers in b. +// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or +// unsigned 8-bit integers in b. +// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b +// and saturates. +// +// r0 := SignedSaturate(a0 + b0) +// r1 := SignedSaturate(a1 + b1) +// ... +// r7 := SignedSaturate(a7 + b7) +// +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in +// b and saturates.. +// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// tmp[31:0] := a[i+15:i] * b[i+15:i] +// dst[i+15:i] := tmp[31:16] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// +// dst[31:0] := a[31:0] * b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// +// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) +// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// +// dst[63:0] := a[31:0] * b[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// +// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 +// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0) + (a1 * b1) +// r1 := (a2 * b2) + (a3 * b3) +// r2 := (a4 * b4) + (a5 * b5) +// r3 := (a6 * b6) + (a7 * b7) +// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// +// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) +// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) +// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) +// ... +// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + +// a[i+7:i]*b[i+7:i] ) +// ENDFOR +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Computes the fused multiple add product of 32-bit floating point numbers. +// +// Return Value +// Multiplies A and B, and adds C to the temporary result before returning it. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd +FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i) vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint16x4_t t = + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// +// FOR j := 0 to 7 +// i := j*8 +// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +// ENDFOR +// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + +// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Divides the four single-precision, floating-point values of a and b. +// +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 +// +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#endif + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); +#endif +} + +// Divides the scalar single-precision floating point value of a by b. +// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// dst[i+63:i] := a[i+63:i] / b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] / db[0]; + c[1] = da[1] / db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// +// dst[31:0] := (1.0 / a[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Computes the approximations of square roots of the four single-precision, +// floating-point values of a. First computes reciprocal square roots and then +// reciprocals of the four values. +// +// r0 := sqrt(a0) +// r1 := sqrt(a1) +// r2 := sqrt(a2) +// r3 := sqrt(a3) +// +// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if SSE2NEON_PRECISE_SQRT + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. + const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#elif defined(__aarch64__) + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t sq = vrecpeq_f32(recipsq); + return vreinterpretq_m128_f32(sq); +#endif +} + +// Computes the approximation of the square root of the scalar single-precision +// floating point value of in. +// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Computes the approximations of the reciprocal square roots of the four +// single-precision floating point values of in. +// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); +#if SSE2NEON_PRECISE_RSQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Computes the maximums of the four single-precision, floating-point values of +// a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 +// signed 16-bit integers from b. +// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// epi versions of min/max +// Computes the pariwise maximums of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 > b0) ? a0 : b0 +// r1 := (a1 > b1) ? a1 : b1 +// r2 := (a2 > b2) ? a2 : b2 +// r3 := (a3 > b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the pariwise minima of the four signed 32-bit integer values of a +// and b. +// +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 < b0) ? a0 : b0 +// r1 := (a1 < b1) ? a1 : b1 +// r2 := (a2 < b2) ? a2 : b2 +// r3 := (a3 < b3) ? a3 : b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit +// integers from b. +// +// r0 := (a0 * b0)[31:16] +// r1 := (a1 * b1)[31:16] +// ... +// r7 := (a7 * b7)[31:16] +// +// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Computes pairwise add of each argument as single-precision, floating-point +// values a and b. +// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32( + vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); +#endif +} + +// Computes pairwise add of each argument as a 16-bit signed or unsigned integer +// values a and b. +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally substract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsubq_f32( + vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), + vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); +#else + float32x4x2_t c = + vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Computes pairwise difference of each argument as a 16-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Subtract + return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); +} + +// Computes saturated pairwise sub of each argument as a 16-bit signed +// integer values a and b. +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Computes saturated pairwise difference of each argument as a 16-bit signed +// integer values a and b. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated subtract + return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); +#endif +} + +// Computes pairwise add of each argument as a 32-bit signed or unsigned integer +// values a and b. +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +} + +// Computes pairwise difference of each argument as a 32-bit signed or unsigned +// integer values a and b. +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int64x2_t a = vreinterpretq_s64_m128i(_a); + int64x2_t b = vreinterpretq_s64_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|b0|b2] + // [a1|a2|b1|b3] + int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); + int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); + // Subtract + return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); +} + +// Kahan summation for accurate summation of floating-point numbers. +// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) +{ + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. + */ + if (imm & (1 << 4)) + _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? s : 0, + }; + return vreinterpretq_m128_f32(res); +} + +/* Compare operations */ + +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compares for greater than. +// +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compares for greater than or equal. +// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for less than or equal. +// +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. +// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. +// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for lesser than. +// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + // ARMv7 lacks vcgtq_s64. + // This is based off of Clang's SSE2 polyfill: + // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) + + // Mask the sign bit out since we need a signed AND an unsigned comparison + // and it is ugly to try and split them. + int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); + int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); + int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); + // Check if a > b + int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi > b_hi + int64x2_t gt_hi = vshrq_n_s64(greater, 63); + // Copy lower mask to upper mask + // a_lo > b_lo + int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); + // Compare for equality + int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); + // Copy upper mask to lower mask + // a_hi == b_hi + int64x2_t eq_hi = vshrq_n_s64(equal, 63); + // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) + int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); + return vreinterpretq_m128i_s64(ret); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important +// note!! The documentation on MSDN is incorrect! If either of the values is a +// NAN the docs say you will get a one, but in fact, it will return a zero!! +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. : +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. : +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; +} + +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. : +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), + // vreinterpretq_f32_m128(b)), 0); + uint32x4_t a_not_nan = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t b_not_nan = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); + uint32x4_t a_neq_b = vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; +} + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* Conversions */ + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if defined(__aarch64__) + return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t diff = data - floor(data); + if (diff > 0.5) + return (int32_t) ceil(data); + if (unlikely(diff == 0.5)) { + int32_t f = (int32_t) floor(data); + int32_t c = (int32_t) ceil(data); + return c & 1 ? f : c; + } + return (int32_t) floor(data); +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then covert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Converts the four signed 32-bit integer values of a to single-precision, +// floating-point values +// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Converts the two unsigned 8-bit integers in the lower 16 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the four unsigned 8-bit integers in the lower 16 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Converts the two signed 8-bit integers in the lower 32 bits to four +// signed 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four signed 16-bit integers in the lower 64 bits to four signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Converts the two signed 16-bit integers in the lower 32 bits two signed +// 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Converts the four unsigned 16-bit integers in the lower 64 bits to four +// unsigned 32-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Converts the two unsigned 16-bit integers in the lower 32 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Converts the two unsigned 32-bit integers in the lower 64 bits to two +// unsigned 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Converts the two signed 32-bit integers in the lower 64 bits to two signed +// 64-bit integers. +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values. +// +// r0 := (int) a0 +// r1 := (int) a1 +// r2 := (int) a2 +// r3 := (int) a3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Copy the lower 32-bit integer in a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Loads 128-bit value. : +// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Loads 128-bit value. : +// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[MAX:32] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) +// ENDFOR +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float) ((double *) &a)[0]; + float a1 = (float) ((double *) &a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t res2; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. +// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + __m128 zero, neg_inf, pos_inf; + + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return (__m128){floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])}; + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), + ceilf(v_float[3])}; + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); + neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), + floorf(v_float[2]), floorf(v_float[3])); + pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), + ceilf(v_float[2]), ceilf(v_float[3])); + return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); + default: //_MM_FROUND_CUR_DIRECTION + return (__m128){roundf(v_float[0]), roundf(v_float[1]), + roundf(v_float[2]), roundf(v_float[3])}; + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); +#else + return vreinterpret_m64_s32( + vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( + _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// +// dst[31:0] := CEIL(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ + return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// +// dst[31:0] := FLOOR(b[31:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss( + a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +/* Miscellaneous Operations */ + +// Shifts the 8 signed 16-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// ... +// r7 := a7 >> count +// +// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (unlikely(c > 15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); +} + +// Shifts the 4 signed 32-bit integers in a right by count bits while shifting +// in the sign bit. +// +// r0 := a0 >> count +// r1 := a1 >> count +// r2 := a2 >> count +// r3 := a3 >> count +// +// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (unlikely(c > 31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); +} + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and +// saturates. +// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// ... +// r7 := UnsignedSaturate(a7) +// r8 := UnsignedSaturate(b0) +// r9 := UnsignedSaturate(b1) +// ... +// r15 := UnsignedSaturate(b7) +// +// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers +// and saturates. +// +// r0 := SignedSaturate(a0) +// r1 := SignedSaturate(a1) +// r2 := SignedSaturate(a2) +// r3 := SignedSaturate(a3) +// r4 := SignedSaturate(b0) +// r5 := SignedSaturate(b1) +// r6 := SignedSaturate(b2) +// r7 := SignedSaturate(b3) +// +// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit +// integers and saturates. +// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// r2 := UnsignedSaturate(a2) +// r3 := UnsignedSaturate(a3) +// r4 := UnsignedSaturate(b0) +// r5 := UnsignedSaturate(b1) +// r6 := UnsignedSaturate(b2) +// r7 := UnsignedSaturate(b3) +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// ... +// r14 := a7 +// r15 := b7 +// +// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the +// lower 4 signed or unsigned 16-bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// r4 := a2 +// r5 := b2 +// r6 := a3 +// r7 := b3 +// +// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the +// lower 2 signed or unsigned 32 - bit integers in b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. +// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... +// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. +// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. +// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. +// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ + }) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + __extension__({ \ + vreinterpret_m64_s16( \ + vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +/* Crypto Extensions */ + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. +// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +#if !defined(__ARM_FEATURE_CRYPTO) +/* clang-format off */ +#define SSE2NEON_AES_DATA(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. +// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/thirdparty/embree/common/simd/avx.h b/thirdparty/embree/common/simd/avx.h new file mode 100644 index 0000000000..d3100306ee --- /dev/null +++ b/thirdparty/embree/common/simd/avx.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "sse.h" + +#if defined(__AVX512VL__) +#include "vboolf8_avx512.h" +#include "vboold4_avx512.h" +#else +#include "vboolf8_avx.h" +#include "vboold4_avx.h" +#endif + +#if defined(__AVX2__) +#include "vint8_avx2.h" +#include "vuint8_avx2.h" +#if defined(__X86_64__) +#include "vllong4_avx2.h" +#endif +#else +#include "vint8_avx.h" +#include "vuint8_avx.h" +#endif +#include "vfloat8_avx.h" +#if defined(__X86_64__) +#include "vdouble4_avx.h" +#endif + +#if defined(__AVX512F__) +#include "avx512.h" +#endif + diff --git a/thirdparty/embree/common/simd/avx512.h b/thirdparty/embree/common/simd/avx512.h new file mode 100644 index 0000000000..d43bbacea1 --- /dev/null +++ b/thirdparty/embree/common/simd/avx512.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../math/constants.h" +#include "../sys/alloc.h" +#include "varying.h" + +#include "vboolf16_avx512.h" +#include "vint16_avx512.h" +#include "vuint16_avx512.h" +#include "vfloat16_avx512.h" + +#include "vboold8_avx512.h" +#include "vllong8_avx512.h" +#include "vdouble8_avx512.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Prefetching + //////////////////////////////////////////////////////////////////////////////// + +#define PFHINT_L1 0 +#define PFHINT_L2 1 +#define PFHINT_NT 2 + + template<const unsigned int mode> + __forceinline void prefetch(const void * __restrict__ const m) + { + if (mode == PFHINT_L1) + _mm_prefetch((const char*)m,_MM_HINT_T0); + else if (mode == PFHINT_L2) + _mm_prefetch((const char*)m,_MM_HINT_T1); + else if (mode == PFHINT_NT) + _mm_prefetch((const char*)m,_MM_HINT_NTA); + } +} diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h new file mode 100644 index 0000000000..195506b530 --- /dev/null +++ b/thirdparty/embree/common/simd/simd.h @@ -0,0 +1,110 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +/* include SSE wrapper classes */ +#if defined(__SSE__) +# include "sse.h" +#endif + +/* include AVX wrapper classes */ +#if defined(__AVX__) +# include "avx.h" +#endif + +/* include AVX512 wrapper classes */ +#if defined (__AVX512F__) +# include "avx512.h" +#endif + +namespace embree +{ + template <int N> + __forceinline vbool<N> isfinite(const vfloat<N>& v) + { + return (v >= vfloat<N>(-std::numeric_limits<float>::max())) + & (v <= vfloat<N>( std::numeric_limits<float>::max())); + } + + /* foreach unique */ + template<typename vbool, typename vint, typename Closure> + __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i); + } + } + + /* returns the next unique value i in vi and the corresponding valid_i mask */ + template<typename vbool, typename vint> + __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return i; + } + + /* foreach unique index */ + template<typename vbool, typename vint, typename Closure> + __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i, j); + } + } + + /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ + template<typename vbool, typename vint> + __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return j; + } + + template<typename Closure> + __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) + { + __aligned(64) int U[2*VSIZEX]; + __aligned(64) int V[2*VSIZEX]; + int index = 0; + for (int y=y0; y<y1; y++) { + const bool lasty = y+1>=y1; + const vintx vy = y; + for (int x=x0; x<x1; ) { //x+=VSIZEX) { + const bool lastx = x+VSIZEX >= x1; + vintx vx = x+vintx(step); + vintx::storeu(&U[index], vx); + vintx::storeu(&V[index], vy); + const int dx = min(x1-x,VSIZEX); + index += dx; + x += dx; + if (index >= VSIZEX || (lastx && lasty)) { + const vboolx valid = vintx(step) < vintx(index); + closure(valid, vintx::load(U), vintx::load(V)); + x-= max(0, index-VSIZEX); + index = 0; + } + } + } + } +} diff --git a/thirdparty/embree/common/simd/sse.cpp b/thirdparty/embree/common/simd/sse.cpp new file mode 100644 index 0000000000..535d6943d8 --- /dev/null +++ b/thirdparty/embree/common/simd/sse.cpp @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sse.h" + +namespace embree +{ + const __m128 mm_lookupmask_ps[16] = { + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) + }; + + const __m128d mm_lookupmask_pd[4] = { + _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) + }; + +} diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h new file mode 100644 index 0000000000..1465fb4fb0 --- /dev/null +++ b/thirdparty/embree/common/simd/sse.h @@ -0,0 +1,35 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../sys/alloc.h" +#include "../math/constants.h" +#include "varying.h" + +namespace embree +{ +#if defined(__SSE4_1__) + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_blendv_ps(f,t,mask); + } +#else + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); + } +#endif + + extern const __m128 mm_lookupmask_ps[16]; + extern const __m128d mm_lookupmask_pd[4]; +} + +#if defined(__AVX512VL__) +#include "vboolf4_avx512.h" +#else +#include "vboolf4_sse2.h" +#endif +#include "vint4_sse2.h" +#include "vuint4_sse2.h" +#include "vfloat4_sse2.h" diff --git a/thirdparty/embree/common/simd/varying.h b/thirdparty/embree/common/simd/varying.h new file mode 100644 index 0000000000..9b98d326be --- /dev/null +++ b/thirdparty/embree/common/simd/varying.h @@ -0,0 +1,145 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +namespace embree +{ + /* Varying numeric types */ + template<int N> + struct vfloat_impl + { + union { float f[N]; int i[N]; }; + __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template<int N> + struct vdouble_impl + { + union { double f[N]; long long i[N]; }; + __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template<int N> + struct vint_impl + { + int i[N]; + __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template<int N> + struct vuint_impl + { + unsigned int i[N]; + __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template<int N> + struct vllong_impl + { + long long i[N]; + __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + /* Varying bool types */ + template<int N> struct vboolf_impl { int i[N]; }; // for float/int + template<int N> struct vboold_impl { long long i[N]; }; // for double/long long + + /* Varying size constants */ +#if defined(__AVX512VL__) // SKX + const int VSIZEX = 8; // default size + const int VSIZEL = 16; // large size +#elif defined(__AVX__) + const int VSIZEX = 8; + const int VSIZEL = 8; +#else + const int VSIZEX = 4; + const int VSIZEL = 4; +#endif + + template<int N> + struct vtypes { + using vbool = vboolf_impl<N>; + using vboolf = vboolf_impl<N>; + using vboold = vboold_impl<N>; + using vint = vint_impl<N>; + using vuint = vuint_impl<N>; + using vllong = vllong_impl<N>; + using vfloat = vfloat_impl<N>; + using vdouble = vdouble_impl<N>; + }; + + template<> + struct vtypes<1> { + using vbool = bool; + using vboolf = bool; + using vboold = bool; + using vint = int; + using vuint = unsigned int; + using vllong = long long; + using vfloat = float; + using vdouble = double; + }; + + /* Aliases to default types */ + template<int N> using vbool = typename vtypes<N>::vbool; + template<int N> using vboolf = typename vtypes<N>::vboolf; + template<int N> using vboold = typename vtypes<N>::vboold; + template<int N> using vint = typename vtypes<N>::vint; + template<int N> using vuint = typename vtypes<N>::vuint; + template<int N> using vllong = typename vtypes<N>::vllong; + template<int N> using vreal = typename vtypes<N>::vfloat; + template<int N> using vfloat = typename vtypes<N>::vfloat; + template<int N> using vdouble = typename vtypes<N>::vdouble; + + /* 4-wide shortcuts */ + typedef vfloat<4> vfloat4; + typedef vdouble<4> vdouble4; + typedef vreal<4> vreal4; + typedef vint<4> vint4; + typedef vuint<4> vuint4; + typedef vllong<4> vllong4; + typedef vbool<4> vbool4; + typedef vboolf<4> vboolf4; + typedef vboold<4> vboold4; + + /* 8-wide shortcuts */ + typedef vfloat<8> vfloat8; + typedef vdouble<8> vdouble8; + typedef vreal<8> vreal8; + typedef vint<8> vint8; + typedef vuint<8> vuint8; + typedef vllong<8> vllong8; + typedef vbool<8> vbool8; + typedef vboolf<8> vboolf8; + typedef vboold<8> vboold8; + + /* 16-wide shortcuts */ + typedef vfloat<16> vfloat16; + typedef vdouble<16> vdouble16; + typedef vreal<16> vreal16; + typedef vint<16> vint16; + typedef vuint<16> vuint16; + typedef vllong<16> vllong16; + typedef vbool<16> vbool16; + typedef vboolf<16> vboolf16; + typedef vboold<16> vboold16; + + /* Default shortcuts */ + typedef vfloat<VSIZEX> vfloatx; + typedef vdouble<VSIZEX> vdoublex; + typedef vreal<VSIZEX> vrealx; + typedef vint<VSIZEX> vintx; + typedef vuint<VSIZEX> vuintx; + typedef vllong<VSIZEX> vllongx; + typedef vbool<VSIZEX> vboolx; + typedef vboolf<VSIZEX> vboolfx; + typedef vboold<VSIZEX> vbooldx; +} diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h new file mode 100644 index 0000000000..7db0d1c5c1 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold4_avx.h @@ -0,0 +1,169 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX bool type for 64bit data types*/ + template<> + struct vboold<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + struct { __m128d vl,vh; }; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& a) { v = a.v; } + __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } + + __forceinline vboold(__m256d a) : v(a) {} + __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} + + __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } + __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } + __forceinline operator const __m256d() const { return v; } + + __forceinline vboold(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi64x(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); +#else + vl = mm_lookupmask_pd[a & 0x3]; + vh = mm_lookupmask_pd[a >> 2]; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} + __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } + + __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { + return _mm256_blendv_pd(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } + __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } + + +#if defined(__AVX2__) + template<int i0, int i1, int i2, int i3> + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i> + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } + + __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } + __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } + + __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } + __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboold4_avx512.h b/thirdparty/embree/common/simd/vboold4_avx512.h new file mode 100644 index 0000000000..ceaad7bba5 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold4_avx512.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboold<4> + { + typedef vboold4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& t) { v = t.v; } + __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xf : 0x0; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x0) {} + __forceinline vboold(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold4& a) { return a.v == 0xf; } + __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboold8_avx512.h b/thirdparty/embree/common/simd/vboold8_avx512.h new file mode 100644 index 0000000000..66d2054872 --- /dev/null +++ b/thirdparty/embree/common/simd/vboold8_avx512.h @@ -0,0 +1,151 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboold<8> + { + typedef vboold8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold8& t) { v = t.v; } + __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8& t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x00) {} + __forceinline vboold(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } + __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } + __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + + __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } + __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } + __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold8& a) { return a.v == 0xff; } + __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } + __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); } + __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h new file mode 100644 index 0000000000..19841dcea8 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf16_avx512.h @@ -0,0 +1,153 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 bool type */ + template<> + struct vboolf<16> + { + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + __mmask16 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf16& t) { v = t.v; } + __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask16& t) { v = t; } + __forceinline operator __mmask16() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } + __forceinline vboolf(int t) { v = (__mmask16)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m512i mask32() const { + return _mm512_movm_epi32(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0000) {} + __forceinline vboolf(TrueTy) : v(0xffff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 16); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } + __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } + __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } + + __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } + __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } + __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } + __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { + return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } + __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } + __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } + + __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } + __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } + __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Convertion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } + __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } + __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) + { + cout << "<"; + for (size_t i=0; i<16; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf4_avx512.h b/thirdparty/embree/common/simd/vboolf4_avx512.h new file mode 100644 index 0000000000..e65f66b025 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf4_avx512.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboolf<4> + { + typedef vboolf4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& t) { v = t.v; } + __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0) {} + __forceinline vboolf(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf4& a) { return a.v == 0xf; } + __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h new file mode 100644 index 0000000000..fa84b1b6ee --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf4_sse2.h @@ -0,0 +1,189 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE bool type */ + template<> + struct vboolf<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& other) { v = other.v; } + __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } + + __forceinline vboolf(__m128 input) : v(input) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator const __m128i() const { return _mm_castps_si128(v); } + __forceinline operator const __m128d() const { return _mm_castps_pd(v); } + + __forceinline vboolf(bool a) + : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b) + : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_castps_si128(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + + __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { +#if defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return shuffle<i0,i0,i0,i0>(v); + } + +#if defined(__SSE3__) + template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } +#endif + +#if defined(__SSE4_1__) + template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } + template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } + __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } + + __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } + __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } + __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } + + __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } +#if defined(__SSE4_2__) + __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } +#else + __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } + __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h new file mode 100644 index 0000000000..ba77cc3c5e --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf8_avx.h @@ -0,0 +1,202 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX bool type */ + template<> + struct vboolf<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256 v; + struct { __m128 vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& a) { v = a.v; } + __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } + + __forceinline vboolf(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } + __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } + + __forceinline vboolf(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi32(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); +#else + vl = mm_lookupmask_ps[a & 0xF]; + vh = mm_lookupmask_ps[a >> 4]; +#endif + } + + __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} + + __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} + __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_castps_si256(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } + + __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) { + return _mm256_blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } + + template<int i> + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1> + __forceinline vboolf8 shuffle4(const vboolf8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } + template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + + __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } + + __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } + __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vboolf8_avx512.h b/thirdparty/embree/common/simd/vboolf8_avx512.h new file mode 100644 index 0000000000..73ff5666e1 --- /dev/null +++ b/thirdparty/embree/common/simd/vboolf8_avx512.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboolf<8> + { + typedef vboolf8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& t) { v = t.v; } + __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) + : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x00) {} + __forceinline vboolf(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf8& a) { return a.v == 0xff; } + __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h new file mode 100644 index 0000000000..55326de7dd --- /dev/null +++ b/thirdparty/embree/common/simd/vdouble4_avx.h @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX 64-bit double type */ + template<> + struct vdouble<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + double i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble4& t) { v = t.v; } + __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m256d& t) { v = t; } + __forceinline operator __m256d() const { return v; } + + __forceinline vdouble(double i) { + v = _mm256_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm256_set_pd(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { + _mm256_stream_pd(ptr, a); + } + + static __forceinline vdouble4 loadu(const double* addr) { + return _mm256_loadu_pd(addr); + } + + static __forceinline vdouble4 load(const vdouble4* addr) { + return _mm256_load_pd((double*)addr); + } + + static __forceinline vdouble4 load(const double* addr) { + return _mm256_load_pd(addr); + } + + static __forceinline void store(double* ptr, const vdouble4& v) { + _mm256_store_pd(ptr, v); + } + + static __forceinline void storeu(double* ptr, const vdouble4& v) { + _mm256_storeu_pd(ptr, v); + } + + static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } + __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } +#endif + + __forceinline vdouble4 operator +(const vdouble4& a) { return a; } + __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } + __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } + __forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } + + __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } + __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } + __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } + + __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } + __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } + __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } + + __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); } + __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } + __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } + + __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } + __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } + __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } + + __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } + __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } + __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } + + __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } + __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } + __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } + + __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } + __forceinline vdouble4 max(const vdouble4& a, double b) { return max(a,vdouble4(b)); } + __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__FMA__) + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } +#else + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } + __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } + + __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } + __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } + + __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } + __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } + + __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } + __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } + + __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } + __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#endif + + __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } + __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } + + __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } + __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } + + __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } + __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } + + __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } + __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } + + __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } + __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } + + __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } + __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } + + __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } + __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { return a != b; } + __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } + __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } + __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } + __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); } +#endif + + __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { +#if defined(__AVX512VL__) + return _mm256_mask_blend_pd(m, f, t); +#else + return _mm256_blendv_pd(f, t, m); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vdouble4 shuffle(const vdouble4& v) { + return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template<int i> + __forceinline vdouble4 shuffle(const vdouble4& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1> + __forceinline vdouble4 shuffle2(const vdouble4& v) { + return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); + } + + __forceinline double toScalar(const vdouble4& v) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); } + __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } + __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vdouble8_avx512.h b/thirdparty/embree/common/simd/vdouble8_avx512.h new file mode 100644 index 0000000000..98d21bfe4a --- /dev/null +++ b/thirdparty/embree/common/simd/vdouble8_avx512.h @@ -0,0 +1,351 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 64-bit double type */ + template<> + struct vdouble<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512d v; + double i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble8& t) { v = t.v; } + __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m512d& t) { v = t; } + __forceinline operator __m512d() const { return v; } + __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } + + __forceinline vdouble(double i) { + v = _mm512_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm512_set4_pd(d,c,b,a); + } + + __forceinline vdouble(double a0, double a1, double a2, double a3, + double a4, double a5, double a6, double a7) + { + v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { + _mm512_stream_pd((double*)ptr, a); + } + + static __forceinline vdouble8 loadu(const void* addr) { + return _mm512_loadu_pd((double*)addr); + } + + static __forceinline vdouble8 load(const vdouble8* addr) { + return _mm512_load_pd((double*)addr); + } + + static __forceinline vdouble8 load(const double* addr) { + return _mm512_load_pd(addr); + } + + static __forceinline void store(void* ptr, const vdouble8& v) { + _mm512_store_pd(ptr, v); + } + + static __forceinline void storeu(void* ptr, const vdouble8& v) { + _mm512_storeu_pd(ptr, v); + } + + static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { + _mm512_mask_storeu_pd(ptr, mask, f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { + _mm512_mask_store_pd(addr, mask, v2); + } + + static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { + return _mm512_mask_compress_pd(a, mask, b); + } + + static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); } + __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } + + __forceinline vdouble8 operator +(const vdouble8& a) { return a; } + __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } + __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); } + __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } + + __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } + __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } + __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; } + + __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); } + __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); } + __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; } + + __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); } + __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); } + __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; } + + __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); } + __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); } + __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; } + + __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); } + __forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); } + __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; } + + __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } + + __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); } + __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); } + __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); } + + __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); } + __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); } + __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); } + + __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); } + __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); } + + __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); } + __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } + __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } + __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } + __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; } + __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; } + + __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; } + __forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; } + + __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; } + __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; } + + __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; } + __forceinline vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; } + + __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; } + __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; } + + __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; } + __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); } + __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; } + + __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); } + __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; } + + __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); } + __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; } + + __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); } + __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; } + + __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); } + __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; } + + __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); } + __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; } + + __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { + return _mm512_mask_or_pd(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template<int i> + __forceinline vdouble8 shuffle(const vdouble8& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1> + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template<int i> + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return shuffle4<i, i>(v); + } + + template<int i> + __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) { + return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i)); + } + + __forceinline double toScalar(const vdouble8& v) { + return _mm_cvtsd_f64(_mm512_castpd512_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); } + __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); } + __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { + return _mm512_permutexvar_pd(index, v); + } + + __forceinline vdouble8 reverse(const vdouble8& a) { + return permute(a, vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h new file mode 100644 index 0000000000..9f1e2459c4 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat16_avx512.h @@ -0,0 +1,615 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 float type */ + template<> + struct vfloat<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512 v; + float f[16]; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat16& t) { v = t; } + __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } + + __forceinline vfloat(const __m512& t) { v = t; } + __forceinline operator __m512() const { return v; } + __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } + __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } + + __forceinline vfloat(float f) { + v = _mm512_set1_ps(f); + } + + __forceinline vfloat(float a, float b, float c, float d) { + v = _mm512_set4_ps(a, b, c, d); + } + + __forceinline vfloat(const vfloat4& i) { + v = _mm512_broadcast_f32x4(i); + } + + __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { + v = _mm512_castps128_ps512(a); + v = _mm512_insertf32x4(v, b, 1); + v = _mm512_insertf32x4(v, c, 2); + v = _mm512_insertf32x4(v, d, 3); + } + + __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { + v = _mm512_broadcast_f32x4(a); + v = _mm512_mask_broadcast_f32x4(v,mask,b); + } + + __forceinline vfloat(const vfloat8& i) { + v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); + } + + __forceinline vfloat(const vfloat8& a, const vfloat8& b) { + v = _mm512_castps256_ps512(a); +#if defined(__AVX512DQ__) + v = _mm512_insertf32x8(v, b, 1); +#else + v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); +#endif + } + + /* WARNING: due to f64x4 the mask is considered as an 8bit mask */ + /*__forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { + __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); + aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); + v = _mm512_castpd_ps(aa); + }*/ + + __forceinline explicit vfloat(const vint16& a) { + v = _mm512_cvtepi32_ps(a); + } + + __forceinline explicit vfloat(const vuint16& a) { + v = _mm512_cvtepu32_ps(a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } + static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } + + static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { + _mm512_stream_ps((float*)ptr,a); + } + + static __forceinline vfloat16 broadcast(const float* f) { + return _mm512_set1_ps(*f); + } + + template<int scale = 4> + static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { + return _mm512_i32gather_ps(index, ptr, scale); + } + + template<int scale = 4> + static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { + vfloat16 r = zero; + return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); + } + + template<int scale = 4> + static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { + _mm512_i32scatter_ps(ptr, index, v, scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { + _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; } + __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } + __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } + __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } + + __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } + __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } + + __forceinline vfloat16 operator +(const vfloat16& a) { return a; } + __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } + + __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } + __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } + + __forceinline vfloat16 rcp(const vfloat16& a) { + const vfloat16 r = _mm512_rcp14_ps(a); + return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); + } + + __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } + __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } + + __forceinline vfloat16 rsqrt(const vfloat16& a) + { + const vfloat16 r = _mm512_rsqrt14_ps(a); + return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, + _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } + __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } + __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } + + __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } + __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } + __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } + + __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } + __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } + __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } + + __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); } + __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } + __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } + + __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } + __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } + __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); + } + + __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); } + __forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a,vfloat16(b)); } + __forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); } + + __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); } + __forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a,vfloat16(b)); } + __forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); } + + __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) { + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_min_epi32(ai,bi); + return _mm512_castsi512_ps(ci); + } + + __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) { + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_max_epi32(ai,bi); + return _mm512_castsi512_ps(ci); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } + __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } + __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } + __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } + + __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } + __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } + + __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } + __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } + + __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } + __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } + __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } + + __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } + __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; } + + __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); } + __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; } + + __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); } + __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; } + + __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); } + __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; } + + __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); } + __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; } + + __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { + return _mm512_mask_blend_ps(s, f, t); + } + + __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { + return madd(t,b-a,a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 floor(const vfloat16& a) { + return _mm512_floor_ps(a); + } + __forceinline vfloat16 ceil (const vfloat16& a) { + return _mm512_ceil_ps(a); + } + __forceinline vfloat16 round (const vfloat16& a) { + return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + } + __forceinline vint16 floori (const vfloat16& a) { + return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } + __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } + + template<int i> + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i> + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 permute(vfloat16 v, __m512i index) { + return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); + } + + __forceinline vfloat16 reverse(const vfloat16& v) { + return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + } + + template<int i> + __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + template<int i> + __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + __forceinline vfloat16 shift_left_1(const vfloat16& a) { + vfloat16 z = zero; + return mask_align_shift_right<15>(0xfffe,z,a,a); + } + + __forceinline vfloat16 shift_right_1(const vfloat16& x) { + return align_shift_right<1>(zero,x); + } + + __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } + + + template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } + + template<int N, int i> + vfloat<N> extractN(const vfloat16& v); + + template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } + template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } + template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } + + template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } + + template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + + template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } + template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + vfloat16 a0a2_b0b2 = unpacklo(r0, r2); + vfloat16 c0c2_d0d2 = unpackhi(r0, r2); + vfloat16 a1a3_b1b3 = unpacklo(r1, r3); + vfloat16 c1c3_d1d3 = unpackhi(r1, r3); + + c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); + c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); + c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); + c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, + const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, + const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), + c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; + transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); + + vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; + transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); + + c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, + const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, + const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), + vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15), + c0, c1, c2, c3, c4, c5, c6, c7); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vfloat16 vreduce_max2(vfloat16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } + __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } + + __forceinline size_t select_min(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_max(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(pos_inf)); + const vbool16 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(neg_inf)); + const vbool16 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + __forceinline vfloat16 prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + __forceinline vfloat16 prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<16-1>(v,z)); + v = min(v,align_shift_right<16-2>(v,z)); + v = min(v,align_shift_right<16-4>(v,z)); + v = min(v,align_shift_right<16-8>(v,z)); + return v; + } + + __forceinline vfloat16 prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<16-1>(v,z)); + v = max(v,align_shift_right<16-2>(v,z)); + v = max(v,align_shift_right<16-4>(v,z)); + v = max(v,align_shift_right<16-8>(v,z)); + return v; + } + + + __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<1>(z,v)); + v = min(v,align_shift_right<2>(z,v)); + v = min(v,align_shift_right<4>(z,v)); + v = min(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<1>(z,v)); + v = max(v,align_shift_right<2>(z,v)); + v = max(v,align_shift_right<4>(z,v)); + v = max(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 rcp_safe(const vfloat16& a) { + return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h new file mode 100644 index 0000000000..5215bf9730 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat4_sse2.h @@ -0,0 +1,722 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE float type */ + template<> + struct vfloat<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; float f[4]; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat4& other) { v = other.v; } + __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } + + __forceinline vfloat(__m128 a) : v(a) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator __m128&() { return v; } + + __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} + + __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} + __forceinline explicit vfloat(const vuint4& x) { + const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); + const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128 af = _mm_cvtepi32_ps(a); + const __m128 bf = _mm_castsi128_ps(b); + v = _mm_add_ps(af,bf); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } + static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } + + static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } +#else + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } +#endif + +#if defined(__AVX__) + static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } +#else + static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } +#endif + + static __forceinline vfloat4 load_nt (const float* ptr) { +#if defined (__SSE4_1__) + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); +#else + return _mm_load_ps(ptr); +#endif + } + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const char* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const char* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const unsigned char* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const unsigned char* ptr) { + //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__SSE4_1__) + static __forceinline vfloat4 load(const short* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const short* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + + static __forceinline vfloat4 load(const unsigned short* ptr) { + return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); + } + + static __forceinline void store_nt(void* ptr, const vfloat4& v) + { +#if defined (__SSE4_1__) + _mm_stream_ps((float*)ptr,v); +#else + _mm_store_ps((float*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_ps(ptr, index, scale); +#else + return vfloat4( + *(float*)(((char*)ptr)+scale*index[0]), + *(float*)(((char*)ptr)+scale*index[1]), + *(float*)(((char*)ptr)+scale*index[2]), + *(float*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { + vfloat4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_ps((float*)ptr, index, v, scale); +#else + *(float*)(((char*)ptr)+scale*index[0]) = v[0]; + *(float*)(((char*)ptr)+scale*index[1]) = v[1]; + *(float*)(((char*)ptr)+scale*index[2]) = v[2]; + *(float*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); +#else + if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(float*)(((char*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(float*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + static __forceinline void store(const vboolf4& mask, char* ptr, const vint4& ofs, const vfloat4& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } + + friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_ps(m, f, t); +#elif defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Load/Store + //////////////////////////////////////////////////////////////////////////////// + + template<> struct mem<vfloat4> + { + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return vfloat4::load (mask,ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return vfloat4::loadu(mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::store (mask,ptr,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { vfloat4::storeu(mask,ptr,v); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } + __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } + __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } + + __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } + __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } + + __forceinline vfloat4 operator +(const vfloat4& a) { return a; } + __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#if defined(__AVX512VL__) + __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } +#else + __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } +#endif + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + + __forceinline vfloat4 rcp(const vfloat4& a) + { +#if defined(__AVX512VL__) + const vfloat4 r = _mm_rcp14_ps(a); +#else + const vfloat4 r = _mm_rcp_ps(a); +#endif + +#if defined(__AVX2__) + return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); +#endif + } + __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } + __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } + + __forceinline vfloat4 rsqrt(const vfloat4& a) + { +#if defined(__AVX512VL__) + vfloat4 r = _mm_rsqrt14_ps(a); +#else + vfloat4 r = _mm_rsqrt_ps(a); +#endif + +#if defined(__ARM_NEON) + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#elif defined(__AVX2__) + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#else + r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + return r; + } + + __forceinline vboolf4 isnan(const vfloat4& a) { + const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); +#if defined(__AVX512VL__) + return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); +#else + return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } + __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } + __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } + + __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } + __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } + __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } + + __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } + __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } + __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } + + __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } + __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } + __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } + + __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } + __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } + + __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } + __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } + __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } + + __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } + __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } + __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } + +#if defined(__SSE4_1__) + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } +#else + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + return min(a,b); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + return max(a,b); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) || defined(__ARM_NEON) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } + __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } + + __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } + __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } + + __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } + __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } + + __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } + __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } +#endif + + __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } + __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } + + __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } + __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } + + __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } + __forceinline vboolf4 operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } + + __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } + __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } + + __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } + __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } + + __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } + __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } + + __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } + __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } + __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } + __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } + __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } + __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) + { +#if defined(__SSE4_1__) + return _mm_blend_ps(f, t, mask); +#else + return select(vboolf4(mask), t, f); +#endif + } + + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid(const vfloat4& v) { + return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite(const vfloat4& a) { + return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { + return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } +#elif defined (__SSE4_1__) + __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } + __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } +#endif + __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } + + __forceinline vint4 floori(const vfloat4& a) { +#if defined(__SSE4_1__) + return vint4(floor(a)); +#else + return vint4(a-vfloat4(0.5f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + +#if defined(__SSE3__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } +#endif + + template<int i> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return shuffle<i,i,i,i>(v); + } + + template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } + +#if defined (__SSE4_1__) + template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } + template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } +#else + template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; } + template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; } +#endif + + __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } + + __forceinline vfloat4 shift_right_1(const vfloat4& x) { + return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); + } + +#if defined (__AVX2__) + __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { + return _mm_permutevar_ps(a,index); + } + + __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } + +#endif + +#if defined(__AVX512VL__) + template<int i> + __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { + return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting Network + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 sort_ascending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = min(a0,b0); + const vfloat4 d0 = max(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = min(a1,b1); + const vfloat4 d1 = max(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = min(a2,b2); + const vfloat4 d2 = max(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vfloat4 sort_descending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = max(a0,b0); + const vfloat4 d0 = min(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = max(a1,b1); + const vfloat4 d1 = min(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = max(a2,b2); + const vfloat4 d2 = min(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } + + __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(pos_inf)); + const vbool4 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(neg_inf)); + const vbool4 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float dot(const vfloat4& a, const vfloat4& b) { + return reduce_add(a*b); + } + + __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b) + { + const vfloat4 a0 = a; + const vfloat4 b0 = shuffle<1,2,0,3>(b); + const vfloat4 a1 = shuffle<1,2,0,3>(a); + const vfloat4 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } + +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h new file mode 100644 index 0000000000..13446454e8 --- /dev/null +++ b/thirdparty/embree/common/simd/vfloat8_avx.h @@ -0,0 +1,758 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX float type */ + template<> + struct vfloat<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { __m256 v; float f[8]; int i[8]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat8& other) { v = other.v; } + __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } + + __forceinline vfloat(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator __m256&() { return v; } + + __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + + __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {} + __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} + __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat8 broadcast(const void* a) { + return _mm256_broadcast_ss((float*)a); + } + + static __forceinline vfloat8 load(const char* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const unsigned char* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const short* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } + static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } +#else + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } +#endif + +#if defined(__AVX2__) + static __forceinline vfloat8 load_nt(void* ptr) { + return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); + } +#endif + + static __forceinline void store_nt(void* ptr, const vfloat8& v) { + _mm256_stream_ps((float*)ptr,v); + } + + template<int scale = 4> + static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { +#if defined(__AVX2__) + return _mm256_i32gather_ps(ptr, index ,scale); +#else + return vfloat8( + *(float*)(((char*)ptr)+scale*index[0]), + *(float*)(((char*)ptr)+scale*index[1]), + *(float*)(((char*)ptr)+scale*index[2]), + *(float*)(((char*)ptr)+scale*index[3]), + *(float*)(((char*)ptr)+scale*index[4]), + *(float*)(((char*)ptr)+scale*index[5]), + *(float*)(((char*)ptr)+scale*index[6]), + *(float*)(((char*)ptr)+scale*index[7])); +#endif + } + + template<int scale = 4> + static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { + vfloat8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(float*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(float*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(float*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(float*)(((char*)ptr)+scale*index[7]); + return r; + #endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); +#else + *(float*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(float*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(float*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(float*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(float*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(float*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(float*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(float*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(float*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(float*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(float*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(float*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(float*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(float*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); } + + __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); } + __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } + + __forceinline vfloat8 operator +(const vfloat8& a) { return a; } + __forceinline vfloat8 operator -(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); + } + __forceinline vfloat8 abs(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); + return _mm256_and_ps(a, mask); + } + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } + __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } + + + static __forceinline vfloat8 rcp(const vfloat8& a) + { +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rcp14_ps(a); +#else + const vfloat8 r = _mm256_rcp_ps(a); +#endif + +#if defined(__AVX2__) + return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); +#else + return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); +#endif + } + __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } + __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } + + static __forceinline vfloat8 rsqrt(const vfloat8& a) + { +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rsqrt14_ps(a); +#else + const vfloat8 r = _mm256_rsqrt_ps(a); +#endif + +#if defined(__AVX2__) + return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#else + return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } + __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } + __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; } + + __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } + __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } + __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } + + __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } + __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } + __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } + + __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } + __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } + __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } + + __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } + __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } + + __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } + __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } + __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } + + __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } + __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } + __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } + + /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ +#if defined(__AVX2__) + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + +#else + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + return asFloat(min(asInt(a),asInt(b))); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + return asFloat(max(asInt(a),asInt(b))); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } +#else + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;} + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } + __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } + + __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } + __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } + + __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } + __forceinline vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } + + __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } + __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_mask_blend_ps(m, f, t); + } +#else + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } +#endif + + template<int mask> + __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { + return _mm256_blend_ps(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } + __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } + + __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } + __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } + + __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } + __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } + + __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } + __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } + + __forceinline vboolf8 operator > (const vfloat8& a, const float& b) { return a > vfloat8(b); } + __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } + + __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } + __forceinline vboolf8 operator <=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; } + + __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } + __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } + __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } + __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } + __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } + __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } +#endif + + __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid (const vfloat8& v) { + return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); + } + + __forceinline bool is_finite (const vfloat8& a) { + return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { + return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } + + template<int i> + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template<int i0, int i1> + __forceinline vfloat8 shuffle4(const vfloat8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } + template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } + template<size_t i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); } + + __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } + +#if defined (__AVX2__) + static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { + return _mm256_permutevar8x32_ps(a, index); + } +#endif + +#if defined(__AVX512VL__) + template<int i> + static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { + return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); + } +#endif + +#if defined (__AVX_I__) + template<const int mode> + static __forceinline vint4 convert_to_hf16(const vfloat8& a) { + return _mm256_cvtps_ph(a, mode); + } + + static __forceinline vfloat8 convert_from_hf16(const vint4& a) { + return _mm256_cvtph_ps(a); + } +#endif + +#if defined(__AVX512VL__) + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + return align_shift_right<1>(zero,x); + } +#else + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + const vfloat8 t0 = shuffle<1,2,3,0>(x); + const vfloat8 t1 = shuffle4<1,0>(t0); + return _mm256_blend_ps(t0,t1,0x88); + } +#endif + + __forceinline vint8 floori(const vfloat8& a) { + return vint8(floor(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) + { + vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); + vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7); + c0 = shuffle4<0,2>(h0,h4); + c1 = shuffle4<0,2>(h1,h5); + c2 = shuffle4<0,2>(h2,h6); + c3 = shuffle4<0,2>(h3,h7); + c4 = shuffle4<1,3>(h0,h4); + c5 = shuffle4<1,3>(h1,h5); + c6 = shuffle4<1,3>(h2,h6); + c7 = shuffle4<1,3>(h3,h7); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(pos_inf)); + const vbool8 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(neg_inf)); + const vbool8 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators (pairs of Vec3fa's) + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + // return vreduce_add4(a*b); + //} + + __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + return _mm256_dp_ps(a,b,0x7F); + } + + __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) + { + const vfloat8 a0 = a; + const vfloat8 b0 = shuffle<1,2,0,3>(b); + const vfloat8 a1 = shuffle<1,2,0,3>(a); + const vfloat8 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } + //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } + //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } + //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } + __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } + //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } + //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } + //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } + //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } + + //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { + // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + //} + + //////////////////////////////////////////////////////////////////////////////// + /// In Register Sorting + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 sort_ascending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = min(a0,b0); + const vfloat8 d0 = max(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = min(a1,b1); + const vfloat8 d1 = max(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = min(a2,b2); + const vfloat8 d2 = max(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = min(a3,b3); + const vfloat8 d3 = max(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = min(a4,b4); + const vfloat8 d4 = max(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = min(a5,b5); + const vfloat8 d5 = max(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vfloat8 sort_descending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = max(a0,b0); + const vfloat8 d0 = min(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = max(a1,b1); + const vfloat8 d1 = min(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = max(a2,b2); + const vfloat8 d2 = min(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = max(a3,b3); + const vfloat8 d3 = min(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = max(a4,b4); + const vfloat8 d4 = min(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = max(a5,b5); + const vfloat8 d5 = min(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint16_avx512.h b/thirdparty/embree/common/simd/vint16_avx512.h new file mode 100644 index 0000000000..3720c3c9d6 --- /dev/null +++ b/thirdparty/embree/common/simd/vint16_avx512.h @@ -0,0 +1,472 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 integer type */ + template<> + struct vint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint16& t) { v = t.v; } + __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } + + __forceinline vint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vint(int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vint(int a, int b, int c, int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vint(int a0 , int a1 , int a2 , int a3, + int a4 , int a5 , int a6 , int a7, + int a8 , int a9 , int a10, int a11, + int a12, int a13, int a14, int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vint(const vint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { + v = _mm512_castsi128_si512(a); + v = _mm512_inserti32x4(v, b, 1); + v = _mm512_inserti32x4(v, c, 2); + v = _mm512_inserti32x4(v, d, 3); + } + + __forceinline vint(const vint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vint(const vint8& a, const vint8& b) { + v = _mm512_castsi256_si512(a); + v = _mm512_inserti64x4(v, b, 1); + } + + __forceinline explicit vint(const __m512& f) { + v = _mm512_cvtps_epi32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } + + static __forceinline vint16 load(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } + + static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } + static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } + + static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } + static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } + + static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template<int scale = 4> + static __forceinline vint16 gather(const int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vint16 operator +(const vint16& a) { return a; } + __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } + __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } + + __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } + __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } + + __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } + __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } + __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } + + __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); } + __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } + + __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } + __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } + + __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } + __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } + + __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } + + __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } + + __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } + __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } + __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } + __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } + __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } + + __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } + __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } + __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } + + __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } + + __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } + __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } + + __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } + __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } + + __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } + __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } + + __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } + __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } + + __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } + __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } + + __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } + __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } + __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } + + __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } + __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } + + __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } + __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } + + __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } + __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } + + __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } + __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } + + __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } + __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } + + __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + + + __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } + __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline int toScalar(const vint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } + + template<int N, int i> + vint<N> extractN(const vint16& v); + + template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } + template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } + template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } + template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } + + template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } + template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } + + template<int i> __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } + template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } + + template<int i> __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } + template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } + __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } + __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 conflict(const vint16& index) + { + return _mm512_conflict_epi32(index); + } + + __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) + { + return _mm512_mask_conflict_epi32(dest,mask,index); + } + + __forceinline vint16 convert_uint32_t(const __m512& f) { + return _mm512_cvtps_epu32(f); + } + + __forceinline vint16 permute(vint16 v, vint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vint16 reverse(const vint16 &a) { + return permute(a,vint16(reverse_step)); + } + + __forceinline vint16 prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vint16 reverse_prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + /* this should use a vbool8 and a vint8_64...*/ + template<int scale = 1, int hint = _MM_HINT_T0> + __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) + { +#if defined(__AVX512PF__) + _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h new file mode 100644 index 0000000000..9814d5c71c --- /dev/null +++ b/thirdparty/embree/common/simd/vint4_sse2.h @@ -0,0 +1,598 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint4& a) { v = a.v; } + __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } + + __forceinline vint(__m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} + + __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} + __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} + __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} + + __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { + return _mm_mask_compress_epi32(v, mask, v); + } + static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { + return _mm_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + + +#if defined(__SSE4_1__) + static __forceinline vint4 load(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vint4 loadu(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } +#else + + static __forceinline vint4 load(const unsigned char* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + + static __forceinline vint4 loadu(const unsigned char* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + +#endif + + static __forceinline vint4 load(const unsigned short* ptr) { +#if defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store(unsigned char* ptr, const vint4& v) { +#if defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + *(int*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned char)v[i]; +#endif + } + + static __forceinline void store(unsigned short* ptr, const vint4& v) { + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; + } + + static __forceinline vint4 load_nt(void* ptr) { +#if defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vint4& v) { +#if defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vint4 gather(const int* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_epi32(ptr, index, scale); +#else + return vint4( + *(int*)(((char*)ptr)+scale*index[0]), + *(int*)(((char*)ptr)+scale*index[1]), + *(int*)(((char*)ptr)+scale*index[2]), + *(int*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { + vint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_epi32((int*)ptr, index, v, scale); +#else + *(int*)(((char*)ptr)+scale*index[0]) = v[0]; + *(int*)(((char*)ptr)+scale*index[1]) = v[1]; + *(int*)(((char*)ptr)+scale*index[2]) = v[2]; + *(int*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); +#else + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*index[3]) = v[3]; +#endif + } + +#if defined(__x86_64__) + static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vint4 operator +(const vint4& a) { return a; } + __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } +#if defined(__SSSE3__) + __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } + __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } + __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } + + __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } + __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } + +#if defined(__SSE4_1__) + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } +#else + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +#endif + __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } + __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } + + __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } + __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } + __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } + + __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } + __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } + __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } + + __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } + __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } + __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } + + __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); } + + __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } + __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } + __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } + __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } + + __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } + __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } + +#if defined(__SSE4_1__) + __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } + __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } +#endif + + __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } + __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } + + __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } + __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } + + __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; } + __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } + __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } + + __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } + __forceinline vboolf4 operator !=(int a, const vint4& b) { return vint4(a) != b; } + + __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } + __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } + + __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } + __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } + + __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } + __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } + + __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } + __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } + + __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } + __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } + __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } + __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } + __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } + __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vint4 select(const vint4& t, const vint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +#if defined(__SSE4_1__) + __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } + + __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } + __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); } + __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } + __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } + __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const vint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + +#if defined(__SSE3__) + template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template<int i> + __forceinline vint4 shuffle(const vint4& v) { + return shuffle<i,i,i,i>(v); + } + +#if defined(__SSE4_1__) + template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } + template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } +#else + template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; } + template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } +#endif + + + template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } + + __forceinline size_t toSizeT(const vint4& v) { +#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround + return toScalar(v); +#else + return _mm_cvtsi128_si64(v); +#endif + } + +#if defined(__AVX512VL__) + + __forceinline vint4 permute(const vint4 &a, const vint4 &index) { + return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); + } + + template<int i> + __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { + return _mm_alignr_epi32(a, b, i); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umin(a0,b0); + const vint4 d0 = umax(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umin(a1,b1); + const vint4 d1 = umax(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umin(a2,b2); + const vint4 d2 = umax(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umax(a0,b0); + const vint4 d0 = umin(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umax(a1,b1); + const vint4 d1 = umin(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umax(a2,b2); + const vint4 d2 = umin(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + +#else + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = min(a0,b0); + const vint4 d0 = max(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = min(a1,b1); + const vint4 d1 = max(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = min(a2,b2); + const vint4 d2 = max(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = max(a0,b0); + const vint4 d0 = min(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = max(a1,b1); + const vint4 d1 = min(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = max(a2,b2); + const vint4 d2 = min(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h new file mode 100644 index 0000000000..f43e9a8c22 --- /dev/null +++ b/thirdparty/embree/common/simd/vint8_avx.h @@ -0,0 +1,470 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vint8 load(const unsigned char* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned char* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 load(const unsigned short* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned short* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline void store(unsigned char* ptr, const vint8& i) { + vint4 il(i.vl); + vint4 ih(i.vh); + vint4::store(ptr + 0,il); + vint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vint8 gather(const int* ptr, const vint8& index) { + return vint8( + *(int*)(((char*)ptr)+scale*index[0]), + *(int*)(((char*)ptr)+scale*index[1]), + *(int*)(((char*)ptr)+scale*index[2]), + *(int*)(((char*)ptr)+scale*index[3]), + *(int*)(((char*)ptr)+scale*index[4]), + *(int*)(((char*)ptr)+scale*index[5]), + *(int*)(((char*)ptr)+scale*index[6]), + *(int*)(((char*)ptr)+scale*index[7])); + } + + template<int scale = 4> + static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { + vint8 r = zero; + if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(int*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(int*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(int*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(int*)(((char*)ptr)+scale*index[7]); + return r; + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { + *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } + __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } + + __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vint8 umin(const vint8& a, int b) { return umin(a,vint8(b)); } + __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } + + __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } + __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } + + __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template<int i> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h new file mode 100644 index 0000000000..e04737ffbe --- /dev/null +++ b/thirdparty/embree/common/simd/vint8_avx2.h @@ -0,0 +1,518 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(unsigned char* ptr, const vint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vint8 gather(const int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32(ptr, index, scale); + } + + template<int scale = 4> + static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { + vint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } +#else + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } + __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } + + __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } + + __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template<int mask> + __forceinline vint8 select(const vint8& t, const vint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + + template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + __forceinline vint8 permute(const vint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template<int i> + static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vllong4_avx2.h b/thirdparty/embree/common/simd/vllong4_avx2.h new file mode 100644 index 0000000000..6c86845877 --- /dev/null +++ b/thirdparty/embree/common/simd/vllong4_avx2.h @@ -0,0 +1,352 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide AVX2 64-bit long long type */ + template<> + struct vllong<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256i v; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong4& t) { v = t.v; } + __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } + + __forceinline vllong(const __m256i& t) { v = t; } + __forceinline operator __m256i() const { return v; } + __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } + + + __forceinline vllong(long long i) { + v = _mm256_set1_epi64x(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm256_set_epi64x(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} + __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); + } + + static __forceinline vllong4 loadu(const void* addr) + { + return _mm256_loadu_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const vllong4* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const long long* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline void store(void* ptr, const vllong4& v) { + _mm256_store_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong4& v) { + _mm256_storeu_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_storeu_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_store_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { + #if defined(__AVX512VL__) + return _mm256_mask_blend_epi64(m, f, t); + #else + return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); + #endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } +#else + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } +#endif + + __forceinline vllong4 operator +(const vllong4& a) { return a; } + __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } + __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } + __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } + + __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } + __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } + __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } + + /* only low 32bit part */ + __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } + __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } + __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } + + __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } + __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); } + __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } + + __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } + __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } + __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } + + __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } + __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } + __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } + + __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } + //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } + + __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } + //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } + //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); } + + __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } + + //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } + //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } + //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } + + //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } + //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } + //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } + +#if defined(__AVX512VL__) + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } +#else + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } + __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } + + __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } + __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } + + __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } + __forceinline vllong4& operator *=(vllong4& a, long long b) { return a = a * b; } + + __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } + __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } + + __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } + __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } + + __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } + //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } +#endif + + __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } + __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } + + __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } + __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } + + __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } + __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } + + __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); } + __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } + + __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } + __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } + + __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } + __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } + + __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } + __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } + __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } + __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } + __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } + __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vllong4 shuffle(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template<int i> + __forceinline vllong4 shuffle(const vllong4& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1> + __forceinline vllong4 shuffle2(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); + } + + __forceinline long long toScalar(const vllong4& v) { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + +#if defined(__AVX512VL__) + __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { + // workaround for GCC 7.x +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) + return _mm256_permutex2var_epi64(a,index,a); +#else + return _mm256_permutexvar_epi64(index,a); +#endif + } + + __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { + return _mm256_permutex2var_epi64(a,index,b); + } + +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + + __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } + __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } + __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } + __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } + __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } + __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vllong8_avx512.h b/thirdparty/embree/common/simd/vllong8_avx512.h new file mode 100644 index 0000000000..ee69411637 --- /dev/null +++ b/thirdparty/embree/common/simd/vllong8_avx512.h @@ -0,0 +1,358 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX-512 64-bit long long type */ + template<> + struct vllong<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512i v; + long long i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong8& t) { v = t.v; } + __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } + + __forceinline vllong(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vllong(long long i) { + v = _mm512_set1_epi64(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm512_set4_epi64(d,c,b,a); + } + + __forceinline vllong(long long a0, long long a1, long long a2, long long a3, + long long a4, long long a5, long long a6, long long a7) + { + v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vllong(const vllong<4>& i) { + v = _mm512_broadcast_i64x4(i); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} + __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vllong8 loadu(const void* addr) { + return _mm512_loadu_si512(addr); + } + + static __forceinline vllong8 load(const vllong8* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const long long* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const unsigned char* ptr) { + return _mm512_cvtepu8_epi64(*(__m128i*)ptr); + } + + static __forceinline void store(void* ptr, const vllong8& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong8& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { + _mm512_mask_storeu_epi64(ptr,mask,f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { + _mm512_mask_store_epi64(addr,mask,v2); + } + + static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_compress_epi64(a,mask,b); + } + + static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_expand_epi64(b,mask,a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } + + __forceinline vllong8 operator +(const vllong8& a) { return a; } + __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } + __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } + __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } + + __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } + __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } + __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } + + __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } + __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } + __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } + + __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } + __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } + __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } + + __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } + __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } + __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; } + + __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } + __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } + __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } + + __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } + + __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } + + __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } + __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } + __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } + + __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } + __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } + __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } + + __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } + __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } + __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } + + __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } + __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } + + __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); } + __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = a + b; } + __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } + + __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } + __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } + + __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } + __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } + + __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } + __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } + + __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } + __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } + + __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } + __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } + __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } + + __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); } + __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } + + __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); } + __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } + + __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } + __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } + + __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } + __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } + + __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } + __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } + + __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { + return _mm512_mask_or_epi64(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i0, int i1> + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template<int i> + __forceinline vllong8 shuffle(const vllong8& v) { + return shuffle<i, i>(v); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1> + __forceinline vllong8 shuffle4(const vllong8& v) { + return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template<int i> + __forceinline vllong8 shuffle4(const vllong8& v) { + return shuffle4<i, i>(v); + } + + template<int i> + __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { + return _mm512_alignr_epi64(a, b, i); + }; + + __forceinline long long toScalar(const vllong8& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } + __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } + __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } + __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } + __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { + return _mm512_permutexvar_epi64(index,v); + } + + __forceinline vllong8 reverse(const vllong8& a) { + return permute(a,vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint16_avx512.h b/thirdparty/embree/common/simd/vuint16_avx512.h new file mode 100644 index 0000000000..c9eb6682ff --- /dev/null +++ b/thirdparty/embree/common/simd/vuint16_avx512.h @@ -0,0 +1,424 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 16-wide AVX-512 unsigned integer type */ + template<> + struct vuint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vuint16 UInt; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + unsigned int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint16& t) { v = t.v; } + __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } + + __forceinline vuint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vuint(unsigned int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vuint(const vuint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vuint(const vuint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3, + unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, + unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, + unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline explicit vuint(const __m512& f) { + v = _mm512_cvtps_epu32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vuint16 loadu(const void* addr) + { + return _mm512_loadu_si512(addr); + } + + static __forceinline vuint16 loadu(const unsigned char* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vuint16 load(const vuint16* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(const unsigned int* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } + + + static __forceinline void store(void* ptr, const vuint16& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vuint16& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { + _mm512_mask_storeu_epi32(ptr,mask,f); + } + + static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { + _mm512_mask_store_epi32(addr,mask,v2); + } + + static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template<int scale = 4> + static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vuint16 operator +(const vuint16& a) { return a; } + __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } + __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } + + __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } + __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } + + __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } + __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } + __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } + + __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } + __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } + + __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } + __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } + + __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } + __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } + + __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } + + __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } + + __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } + __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } + __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } + __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } + + __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } + __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } + __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } + + __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } + __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; } + + __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } + __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } + + __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } + __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } + + __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } + __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } + + __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } + __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } + + __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } + __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } + __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } + + __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } + __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } + + __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } + __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } + + __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } + __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } + + __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } + __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } + + __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } + __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } + + __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + + + __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template<int i> + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i> + __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline unsigned int toScalar(const vuint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_and(const vuint16& v) { return toScalar(vreduce_and(v)); } + __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } + __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 permute(vuint16 v, vuint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vuint16 reverse(const vuint16& a) { + return permute(a,vuint16(reverse_step)); + } + + __forceinline vuint16 prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vuint16 reverse_prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h new file mode 100644 index 0000000000..0601b9ab80 --- /dev/null +++ b/thirdparty/embree/common/simd/vuint4_sse2.h @@ -0,0 +1,426 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vuint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vuint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; unsigned int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint4& a) { v = a.v; } + __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } + + __forceinline vuint(const __m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + + __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} +#endif + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} + __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + +#if defined(__SSE4_1__) + static __forceinline vuint4 load(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vuint4 loadu(const unsigned char* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + +#endif + + static __forceinline vuint4 load(const unsigned short* ptr) { +#if defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline vuint4 load_nt(void* ptr) { +#if defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vuint4& v) { +#if defined(__SSE4_1__) + _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template<int scale = 4> + static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { +#if defined(__AVX2__) + return _mm_i32gather_epi32((const int*)ptr, index, scale); +#else + return vuint4( + *(unsigned int*)(((char*)ptr)+scale*index[0]), + *(unsigned int*)(((char*)ptr)+scale*index[1]), + *(unsigned int*)(((char*)ptr)+scale*index[2]), + *(unsigned int*)(((char*)ptr)+scale*index[3])); +#endif + } + + template<int scale = 4> + static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { + vuint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) + return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + return r; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vuint4 operator +(const vuint4& a) { return a; } + __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } + __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } + __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } + + __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } + __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); } +//#else +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +//#endif +// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } +// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } + + __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } + __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } + __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } + + __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } + __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } + __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } + + __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } + __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } + __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } + + __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } + + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } + __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } + + __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } + __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } +// __forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } +//#endif + + __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } + __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } + + __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } + __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } + + __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } + __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } + __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } + + __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } + __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; } + + //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } + //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } + + //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } + //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } + + //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); } + //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } + + //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } + //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } + + __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } + __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } + //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; } + //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } + //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } + //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } +#endif + + template<int mask> + __forceinline vuint4 select(const vuint4& t, const vuint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +/*#if defined(__SSE4_1__) + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } + __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } + __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } + __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + +#if defined(__SSE3__) + template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template<int i> + __forceinline vuint4 shuffle(const vuint4& v) { + return shuffle<i,i,i,i>(v); + } + +#if defined(__SSE4_1__) + template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } +#else + template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } +#endif + + + template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if 0 +#if defined(__SSE4_1__) + + __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h new file mode 100644 index 0000000000..589cd9d731 --- /dev/null +++ b/thirdparty/embree/common/simd/vuint8_avx.h @@ -0,0 +1,386 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vuint8 load(const unsigned char* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned char* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 load(const unsigned short* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned short* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline void store(unsigned char* ptr, const vuint8& i) { + vuint4 il(i.vl); + vuint4 ih(i.vh); + vuint4::store(ptr + 0,il); + vuint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { + return vuint8( + *(unsigned int*)(((char*)ptr)+scale*index[0]), + *(unsigned int*)(((char*)ptr)+scale*index[1]), + *(unsigned int*)(((char*)ptr)+scale*index[2]), + *(unsigned int*)(((char*)ptr)+scale*index[3]), + *(unsigned int*)(((char*)ptr)+scale*index[4]), + *(unsigned int*)(((char*)ptr)+scale*index[5]), + *(unsigned int*)(((char*)ptr)+scale*index[6]), + *(unsigned int*)(((char*)ptr)+scale*index[7])); + } + + template<int scale = 4> + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { + vuint8 r = zero; + if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((char*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((char*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(unsigned int*)(((char*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(unsigned int*)(((char*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(unsigned int*)(((char*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(unsigned int*)(((char*)ptr)+scale*index[7]); + return r; + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { + *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { + if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } + + __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template<int i> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h new file mode 100644 index 0000000000..17b994522f --- /dev/null +++ b/thirdparty/embree/common/simd/vuint8_avx2.h @@ -0,0 +1,446 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define vboolf vboolf_impl +#define vboold vboold_impl +#define vint vint_impl +#define vuint vuint_impl +#define vllong vllong_impl +#define vfloat vfloat_impl +#define vdouble vdouble_impl + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned char* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vuint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(unsigned char* ptr, const vuint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template<int scale = 4> + static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32((const int*) ptr, index, scale); + } + + template<int scale = 4> + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { + vuint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); +#endif + } + + template<int scale = 4> + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template<int scale = 4> + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((char*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((char*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((char*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((char*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((char*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((char*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } +#else + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } + + __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } + + __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template<int mask> + __forceinline vuint8 select(const vuint8& t, const vuint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } + //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } + //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } + //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } + //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } + //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } + //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template<int i> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1> + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template<int i> + __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} + +#undef vboolf +#undef vboold +#undef vint +#undef vuint +#undef vllong +#undef vfloat +#undef vdouble diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp new file mode 100644 index 0000000000..abdd269069 --- /dev/null +++ b/thirdparty/embree/common/sys/alloc.cpp @@ -0,0 +1,327 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "intrinsics.h" +#include "sysinfo.h" +#include "mutex.h" + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + void* alignedMalloc(size_t size, size_t align) + { + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return ptr; + } + + void alignedFree(void* ptr) + { + if (ptr) + _mm_free(ptr); + } + + static bool huge_pages_enabled = false; + static MutexSys os_init_mutex; + + __forceinline bool isHugePageCandidate(const size_t bytes) + { + if (!huge_pages_enabled) + return false; + + /* use huge pages only when memory overhead is low */ + const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); + return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <malloc.h> + +namespace embree +{ + bool win_enable_selockmemoryprivilege (bool verbose) + { + HANDLE hToken; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { + if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + TOKEN_PRIVILEGES tp; + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { + if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + SetLastError(ERROR_SUCCESS); + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; + return false; + } + + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl; + return false; + } + + return true; + } + + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + + if (GetLargePageMinimum() != PAGE_SIZE_2M) { + huge_pages_enabled = false; + return false; + } + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { + int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + if (ptr != nullptr) { + hugepages = true; + return ptr; + } + } + + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- + hugepages = false; + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + if (hugepages) // decommitting huge pages seems not to work under Windows + return bytesOld; + + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) + { + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <sys/mman.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <sstream> + +#if defined(__MACOSX__) +#include <mach/vm_statistics.h> +#endif + +namespace embree +{ + bool os_init(bool hugepages, bool verbose) + { + Lock<MutexSys> lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + +#if defined(__LINUX__) + + int hugepagesize = 0; + + std::ifstream file; + file.open("/proc/meminfo",std::ios::in); + if (!file.is_open()) { + if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } + + std::string line; + while (getline(file,line)) + { + std::stringstream sline(line); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string tag; getline(sline,tag,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string val; getline(sline,val,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string unit; getline(sline,unit,' '); + if (tag == "Hugepagesize:" && unit == "kB") { + hugepagesize = std::stoi(val)*1024; + break; + } + } + + if (hugepagesize != PAGE_SIZE_2M) + { + if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } +#endif + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { +#if defined(__MACOSX__) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#endif + } + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ + os_advise(ptr,bytes); + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + /* for hugepages we need to also align the size */ + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ + void os_advise(void* pptr, size_t bytes) + { +#if defined(MADV_HUGEPAGE) + madvise(pptr,bytes,MADV_HUGEPAGE); +#endif + } +} + +#endif diff --git a/thirdparty/embree/common/sys/alloc.h b/thirdparty/embree/common/sys/alloc.h new file mode 100644 index 0000000000..4fa474ec1d --- /dev/null +++ b/thirdparty/embree/common/sys/alloc.h @@ -0,0 +1,164 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include <vector> +#include <set> + +namespace embree +{ +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ + void* operator new[](size_t size) { return alignedMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: + + /*! aligned allocation */ + void* alignedMalloc(size_t size, size_t align); + void alignedFree(void* ptr); + + /*! allocator that performs aligned allocations */ + template<typename T, size_t alignment> + struct aligned_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return alignedFree(p); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + + /*! allocates pages directly from OS */ + bool win_enable_selockmemoryprivilege(bool verbose); + bool os_init(bool hugepages, bool verbose); + void* os_malloc (size_t bytes, bool& hugepages); + size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); + void os_free (void* ptr, size_t bytes, bool hugepages); + void os_advise (void* ptr, size_t bytes); + + /*! allocator that performs OS allocations */ + template<typename T> + struct os_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline os_allocator () + : hugepages(false) {} + + __forceinline pointer allocate( size_type n ) { + return (pointer) os_malloc(n*sizeof(value_type),hugepages); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return os_free(p,n*sizeof(value_type),hugepages); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + bool hugepages; + }; + + /*! allocator for IDs */ + template<typename T, size_t max_id> + struct IDPool + { + typedef T value_type; + + IDPool () + : nextID(0) {} + + T allocate() + { + /* return ID from list */ + if (!IDs.empty()) + { + T id = *IDs.begin(); + IDs.erase(IDs.begin()); + return id; + } + + /* allocate new ID */ + else + { + if (size_t(nextID)+1 > max_id) + return -1; + + return nextID++; + } + } + + /* adds an ID provided by the user */ + bool add(T id) + { + if (id > max_id) + return false; + + /* check if ID should be in IDs set */ + if (id < nextID) { + auto p = IDs.find(id); + if (p == IDs.end()) return false; + IDs.erase(p); + return true; + } + + /* otherwise increase ID set */ + else + { + for (T i=nextID; i<id; i++) { + IDs.insert(i); + } + nextID = id+1; + return true; + } + } + + void deallocate( T id ) + { + assert(id < nextID); + MAYBE_UNUSED auto done = IDs.insert(id).second; + assert(done); + } + + private: + std::set<T> IDs; //!< stores deallocated IDs to be reused + T nextID; //!< next ID to use when IDs vector is empty + }; +} + diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h new file mode 100644 index 0000000000..dd9190c52a --- /dev/null +++ b/thirdparty/embree/common/sys/array.h @@ -0,0 +1,222 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "alloc.h" + +namespace embree +{ + /*! static array with static size */ + template<typename T, size_t N> + class array_t + { + public: + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+N; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return N == 0; } + __forceinline size_t size () const { return N; } + __forceinline size_t max_size () const { return N; } + + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < N); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& front() const { assert(N > 0); return items[0]; }; + __forceinline T& back () const { assert(N > 0); return items[N-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + T items[N]; + }; + + /*! static array with dynamic size */ + template<typename T, size_t N> + class darray_t + { + public: + + __forceinline darray_t () : M(0) {} + + __forceinline darray_t (const T& v) : M(0) { + for (size_t i=0; i<N; i++) items[i] = v; + } + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+M; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return M == 0; } + __forceinline size_t size () const { return M; } + __forceinline size_t capacity () const { return N; } + __forceinline size_t max_size () const { return N; } + + void resize(size_t new_size) { + assert(new_size < max_size()); + M = new_size; + } + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& v) + { + assert(M+1 < max_size()); + items[M++] = v; + } + + __forceinline void pop_back() + { + assert(!empty()); + M--; + } + + __forceinline void clear() { + M = 0; + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < M); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < M); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; } + + __forceinline T& front() const { assert(M > 0); return items[0]; }; + __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + size_t M; + T items[N]; + }; + + /*! dynamic sized array that is allocated on the stack */ +#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N) + template<typename Ty, size_t max_stack_bytes> + struct __aligned(64) StackArray + { + __forceinline StackArray (const size_t N) + : N(N) + { + if (N*sizeof(Ty) <= max_stack_bytes) + data = &arr[0]; + else + data = (Ty*) alignedMalloc(N*sizeof(Ty),64); + } + + __forceinline ~StackArray () { + if (data != &arr[0]) alignedFree(data); + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<N); return data[i]; } + __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; } + + __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } + +#if defined(__64BIT__) + __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } + __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } +#endif + + private: + Ty arr[max_stack_bytes/sizeof(Ty)]; + Ty* data; + size_t N; + + private: + StackArray (const StackArray& other) DELETED; // do not implement + StackArray& operator= (const StackArray& other) DELETED; // do not implement + + }; + + /*! dynamic sized array that is allocated on the stack */ + template<typename Ty, size_t max_stack_elements, size_t max_total_elements> + struct __aligned(64) DynamicStackArray + { + __forceinline DynamicStackArray () + : data(&arr[0]) {} + + __forceinline ~DynamicStackArray () + { + if (!isStackAllocated()) + delete[] data; + } + + __forceinline bool isStackAllocated() const { + return data == &arr[0]; + } + + __forceinline size_t size() const + { + if (isStackAllocated()) return max_stack_elements; + else return max_total_elements; + } + + __forceinline void resize(size_t M) + { + assert(M <= max_total_elements); + if (likely(M <= max_stack_elements)) return; + if (likely(!isStackAllocated())) return; + + data = new Ty[max_total_elements]; + + for (size_t i=0; i<max_stack_elements; i++) + data[i] = arr[i]; + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } + __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } + +#if defined(__64BIT__) + __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } +#endif + + __forceinline DynamicStackArray (const DynamicStackArray& other) + : data(&arr[0]) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + } + + DynamicStackArray& operator= (const DynamicStackArray& other) + { + for (size_t i=0; i<other.size(); i++) + this->operator[] (i) = other[i]; + + return *this; + } + + private: + Ty arr[max_stack_elements]; + Ty* data; + }; +} diff --git a/thirdparty/embree/common/sys/atomic.h b/thirdparty/embree/common/sys/atomic.h new file mode 100644 index 0000000000..67af254f36 --- /dev/null +++ b/thirdparty/embree/common/sys/atomic.h @@ -0,0 +1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <atomic> +#include "intrinsics.h" + +namespace embree +{ +/* compiler memory barriers */ +#if defined(__INTEL_COMPILER) +//#define __memory_barrier() __memory_barrier() +#elif defined(__GNUC__) || defined(__clang__) +# define __memory_barrier() asm volatile("" ::: "memory") +#elif defined(_MSC_VER) +# define __memory_barrier() _ReadWriteBarrier() +#endif + + template <typename T> + struct atomic : public std::atomic<T> + { + atomic () {} + + atomic (const T& a) + : std::atomic<T>(a) {} + + atomic (const atomic<T>& a) { + this->store(a.load()); + } + + atomic& operator=(const atomic<T>& other) { + this->store(other.load()); + return *this; + } + }; + + template<typename T> + __forceinline void atomic_min(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a <= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } + + template<typename T> + __forceinline void atomic_max(std::atomic<T>& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a >= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } +} diff --git a/thirdparty/embree/common/sys/barrier.cpp b/thirdparty/embree/common/sys/barrier.cpp new file mode 100644 index 0000000000..0c0e39d92d --- /dev/null +++ b/thirdparty/embree/common/sys/barrier.cpp @@ -0,0 +1,289 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.h" +#include "condition.h" +#include "regression.h" +#include "thread.h" + +#if defined (__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : i(0), enterCount(0), exitCount(0), barrierSize(0) + { + events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + init(N); + } + + __forceinline ~BarrierSysImplementation () + { + CloseHandle(events[0]); + CloseHandle(events[1]); + } + + __forceinline void init(size_t N) + { + barrierSize = N; + enterCount.store(N); + exitCount.store(N); + } + + __forceinline void wait() + { + /* every thread entering the barrier decrements this count */ + size_t i0 = i; + size_t cnt0 = enterCount--; + + /* all threads except the last one are wait in the barrier */ + if (cnt0 > 1) + { + if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) + THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); + } + + /* the last thread starts all threads waiting at the barrier */ + else + { + i = 1-i; + enterCount.store(barrierSize); + if (SetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("SetEvent failed"); + } + + /* every thread leaving the barrier decrements this count */ + size_t cnt1 = exitCount--; + + /* the last thread that left the barrier resets the event again */ + if (cnt1 == 1) + { + exitCount.store(barrierSize); + if (ResetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("ResetEvent failed"); + } + } + + public: + HANDLE events[2]; + atomic<size_t> i; + atomic<size_t> enterCount; + atomic<size_t> exitCount; + size_t barrierSize; + }; +} + +#else + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : count(0), barrierSize(0) + { + init(N); + } + + __forceinline void init(size_t N) + { + assert(count == 0); + count = 0; + barrierSize = N; + } + + __forceinline void wait() + { + mutex.lock(); + count++; + + if (count == barrierSize) { + count = 0; + cond.notify_all(); + mutex.unlock(); + return; + } + + cond.wait(mutex); + mutex.unlock(); + return; + } + + public: + MutexSys mutex; + ConditionSys cond; + volatile size_t count; + volatile size_t barrierSize; + }; +} + +#endif + +namespace embree +{ + BarrierSys::BarrierSys (size_t N) { + opaque = new BarrierSysImplementation(N); + } + + BarrierSys::~BarrierSys () { + delete (BarrierSysImplementation*) opaque; + } + + void BarrierSys::init(size_t count) { + ((BarrierSysImplementation*) opaque)->init(count); + } + + void BarrierSys::wait() { + ((BarrierSysImplementation*) opaque)->wait(); + } + + LinearBarrierActive::LinearBarrierActive (size_t N) + : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) + { + if (N == 0) N = getNumberOfLogicalThreads(); + init(N); + } + + LinearBarrierActive::~LinearBarrierActive() + { + delete[] count0; + delete[] count1; + } + + void LinearBarrierActive::init(size_t N) + { + if (threadCount != N) { + threadCount = N; + if (count0) delete[] count0; count0 = new unsigned char[N]; + if (count1) delete[] count1; count1 = new unsigned char[N]; + } + mode = 0; + flag0 = 0; + flag1 = 0; + for (size_t i=0; i<N; i++) count0[i] = 0; + for (size_t i=0; i<N; i++) count1[i] = 0; + } + + void LinearBarrierActive::wait (const size_t threadIndex) + { + if (mode == 0) + { + if (threadIndex == 0) + { + for (size_t i=0; i<threadCount; i++) + count1[i] = 0; + + for (size_t i=1; i<threadCount; i++) + { + while (likely(count0[i] == 0)) + pause_cpu(); + } + mode = 1; + flag1 = 0; + __memory_barrier(); + flag0 = 1; + } + else + { + count0[threadIndex] = 1; + { + while (likely(flag0 == 0)) + pause_cpu(); + } + + } + } + else + { + if (threadIndex == 0) + { + for (size_t i=0; i<threadCount; i++) + count0[i] = 0; + + for (size_t i=1; i<threadCount; i++) + { + while (likely(count1[i] == 0)) + pause_cpu(); + } + + mode = 0; + flag0 = 0; + __memory_barrier(); + flag1 = 1; + } + else + { + count1[threadIndex] = 1; + { + while (likely(flag1 == 0)) + pause_cpu(); + } + } + } + } + + struct barrier_sys_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic<size_t> threadID; + std::atomic<size_t> numFailed; + std::vector<size_t> threadResults; + + barrier_sys_regression_test() + : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(barrier_sys_regression_test* This) + { + size_t tid = This->threadID++; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + This->threadResults[tid] = tid; + This->barrier.wait(); + } + } + + bool run () + { + threadID.store(0); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + threadResults.resize(numThreads); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + for (size_t i=0; i<numThreads; i++) threadResults[i] = 0; + barrier.wait(); + barrier.wait(); + for (size_t i=0; i<numThreads; i++) numFailed += threadResults[i] != i; + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + return numFailed == 0; + } + }; + + barrier_sys_regression_test barrier_sys_regression_test; +} + + diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h new file mode 100644 index 0000000000..37fc036291 --- /dev/null +++ b/thirdparty/embree/common/sys/barrier.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intrinsics.h" +#include "sysinfo.h" +#include "atomic.h" + +namespace embree +{ + /*! system barrier using operating system */ + class BarrierSys + { + public: + + /*! construction / destruction */ + BarrierSys (size_t N = 0); + ~BarrierSys (); + + private: + /*! class in non-copyable */ + BarrierSys (const BarrierSys& other) DELETED; // do not implement + BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t count); + + /*! lets calling thread wait in barrier */ + void wait(); + + private: + void* opaque; + }; + + /*! fast active barrier using atomitc counter */ + struct BarrierActive + { + public: + BarrierActive () + : cntr(0) {} + + void reset() { + cntr.store(0); + } + + void wait (size_t numThreads) + { + cntr++; + while (cntr.load() != numThreads) + pause_cpu(); + } + + private: + std::atomic<size_t> cntr; + }; + + /*! fast active barrier that does not require initialization to some number of threads */ + struct BarrierActiveAutoReset + { + public: + BarrierActiveAutoReset () + : cntr0(0), cntr1(0) {} + + void wait (size_t threadCount) + { + cntr0.fetch_add(1); + while (cntr0 != threadCount) pause_cpu(); + cntr1.fetch_add(1); + while (cntr1 != threadCount) pause_cpu(); + cntr0.fetch_add(-1); + while (cntr0 != 0) pause_cpu(); + cntr1.fetch_add(-1); + while (cntr1 != 0) pause_cpu(); + } + + private: + std::atomic<size_t> cntr0; + std::atomic<size_t> cntr1; + }; + + class LinearBarrierActive + { + public: + + /*! construction and destruction */ + LinearBarrierActive (size_t threadCount = 0); + ~LinearBarrierActive(); + + private: + /*! class in non-copyable */ + LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement + LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t threadCount); + + /*! thread with threadIndex waits in the barrier */ + void wait (const size_t threadIndex); + + private: + volatile unsigned char* count0; + volatile unsigned char* count1; + volatile unsigned int mode; + volatile unsigned int flag0; + volatile unsigned int flag1; + volatile size_t threadCount; + }; +} + diff --git a/thirdparty/embree/common/sys/condition.cpp b/thirdparty/embree/common/sys/condition.cpp new file mode 100644 index 0000000000..606a1d0b04 --- /dev/null +++ b/thirdparty/embree/common/sys/condition.cpp @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "condition.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + InitializeConditionVariable(&cond); + } + + __forceinline ~ConditionImplementation () { + } + + __forceinline void wait(MutexSys& mutex_in) { + SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); + } + + __forceinline void notify_all() { + WakeAllConditionVariable(&cond); + } + + public: + CONDITION_VARIABLE cond; + }; +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + if (pthread_cond_init(&cond,nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_cond_init failed"); + } + + __forceinline ~ConditionImplementation() { + MAYBE_UNUSED bool ok = pthread_cond_destroy(&cond) == 0; + assert(ok); + } + + __forceinline void wait(MutexSys& mutex) { + if (pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex) != 0) + THROW_RUNTIME_ERROR("pthread_cond_wait failed"); + } + + __forceinline void notify_all() { + if (pthread_cond_broadcast(&cond) != 0) + THROW_RUNTIME_ERROR("pthread_cond_broadcast failed"); + } + + public: + pthread_cond_t cond; + }; +} +#endif + +namespace embree +{ + ConditionSys::ConditionSys () { + cond = new ConditionImplementation; + } + + ConditionSys::~ConditionSys() { + delete (ConditionImplementation*) cond; + } + + void ConditionSys::wait(MutexSys& mutex) { + ((ConditionImplementation*) cond)->wait(mutex); + } + + void ConditionSys::notify_all() { + ((ConditionImplementation*) cond)->notify_all(); + } +} diff --git a/thirdparty/embree/common/sys/condition.h b/thirdparty/embree/common/sys/condition.h new file mode 100644 index 0000000000..557c6e3482 --- /dev/null +++ b/thirdparty/embree/common/sys/condition.h @@ -0,0 +1,31 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mutex.h" + +namespace embree +{ + class ConditionSys + { + public: + ConditionSys(); + ~ConditionSys(); + void wait( class MutexSys& mutex ); + void notify_all(); + + template<typename Predicate> + __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) + { + while (!pred()) wait(mutex); + } + + private: + ConditionSys (const ConditionSys& other) DELETED; // do not implement + ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement + + protected: + void* cond; + }; +} diff --git a/thirdparty/embree/common/sys/filename.cpp b/thirdparty/embree/common/sys/filename.cpp new file mode 100644 index 0000000000..f55b224302 --- /dev/null +++ b/thirdparty/embree/common/sys/filename.cpp @@ -0,0 +1,138 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "filename.h" +#include "sysinfo.h" + +namespace embree +{ +#ifdef __WIN32__ + const char path_sep = '\\'; +#else + const char path_sep = '/'; +#endif + + /*! create an empty filename */ + FileName::FileName () {} + + /*! create a valid filename from a string */ + FileName::FileName (const char* in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! create a valid filename from a string */ + FileName::FileName (const std::string& in) { + filename = in; + for (size_t i=0; i<filename.size(); i++) + if (filename[i] == '\\' || filename[i] == '/') + filename[i] = path_sep; + while (!filename.empty() && filename[filename.size()-1] == path_sep) + filename.resize(filename.size()-1); + } + + /*! returns path to home folder */ + FileName FileName::homeFolder() + { +#ifdef __WIN32__ + const char* home = getenv("UserProfile"); +#else + const char* home = getenv("HOME"); +#endif + if (home) return home; + return ""; + } + + /*! returns path to executable */ + FileName FileName::executableFolder() { + return FileName(getExecutableFileName()).path(); + } + + /*! returns the path */ + FileName FileName::path() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return FileName(); + return filename.substr(0,pos); + } + + /*! returns the basename */ + std::string FileName::base() const { + size_t pos = filename.find_last_of(path_sep); + if (pos == std::string::npos) return filename; + return filename.substr(pos+1); + } + + /*! returns the extension */ + std::string FileName::ext() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return ""; + return filename.substr(pos+1); + } + + /*! returns the extension */ + FileName FileName::dropExt() const { + size_t pos = filename.find_last_of('.'); + if (pos == std::string::npos) return filename; + return filename.substr(0,pos); + } + + /*! returns the basename without extension */ + std::string FileName::name() const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) end = filename.size(); + return filename.substr(start, end - start); + } + + /*! replaces the extension */ + FileName FileName::setExt(const std::string& ext) const { + size_t start = filename.find_last_of(path_sep); + if (start == std::string::npos) start = 0; else start++; + size_t end = filename.find_last_of('.'); + if (end == std::string::npos || end < start) return FileName(filename+ext); + return FileName(filename.substr(0,end)+ext); + } + + /*! adds the extension */ + FileName FileName::addExt(const std::string& ext) const { + return FileName(filename+ext); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const FileName& other ) const { + if (filename == "") return FileName(other); + else return FileName(filename + path_sep + other.filename); + } + + /*! concatenates two filenames to this/other */ + FileName FileName::operator +( const std::string& other ) const { + return operator+(FileName(other)); + } + + /*! removes the base from a filename (if possible) */ + FileName FileName::operator -( const FileName& base ) const { + size_t pos = filename.find_first_of(base); + if (pos == std::string::npos) return *this; + return FileName(filename.substr(pos+1)); + } + + /*! == operator */ + bool operator== (const FileName& a, const FileName& b) { + return a.filename == b.filename; + } + + /*! != operator */ + bool operator!= (const FileName& a, const FileName& b) { + return a.filename != b.filename; + } + + /*! output operator */ + std::ostream& operator<<(std::ostream& cout, const FileName& filename) { + return cout << filename.filename; + } +} diff --git a/thirdparty/embree/common/sys/filename.h b/thirdparty/embree/common/sys/filename.h new file mode 100644 index 0000000000..d5929cd836 --- /dev/null +++ b/thirdparty/embree/common/sys/filename.h @@ -0,0 +1,81 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! Convenience class for handling file names and paths. */ + class FileName + { + public: + + /*! create an empty filename */ + FileName (); + + /*! create a valid filename from a string */ + FileName (const char* filename); + + /*! create a valid filename from a string */ + FileName (const std::string& filename); + + /*! returns path to home folder */ + static FileName homeFolder(); + + /*! returns path to executable */ + static FileName executableFolder(); + + /*! auto convert into a string */ + operator std::string() const { return filename; } + + /*! returns a string of the filename */ + const std::string str() const { return filename; } + + /*! returns a c-string of the filename */ + const char* c_str() const { return filename.c_str(); } + + /*! returns the path of a filename */ + FileName path() const; + + /*! returns the file of a filename */ + std::string base() const; + + /*! returns the base of a filename without extension */ + std::string name() const; + + /*! returns the file extension */ + std::string ext() const; + + /*! drops the file extension */ + FileName dropExt() const; + + /*! replaces the file extension */ + FileName setExt(const std::string& ext = "") const; + + /*! adds file extension */ + FileName addExt(const std::string& ext = "") const; + + /*! concatenates two filenames to this/other */ + FileName operator +( const FileName& other ) const; + + /*! concatenates two filenames to this/other */ + FileName operator +( const std::string& other ) const; + + /*! removes the base from a filename (if possible) */ + FileName operator -( const FileName& base ) const; + + /*! == operator */ + friend bool operator==(const FileName& a, const FileName& b); + + /*! != operator */ + friend bool operator!=(const FileName& a, const FileName& b); + + /*! output operator */ + friend std::ostream& operator<<(std::ostream& cout, const FileName& filename); + + private: + std::string filename; + }; +} diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h new file mode 100644 index 0000000000..ed8dd7d40a --- /dev/null +++ b/thirdparty/embree/common/sys/intrinsics.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#if defined(__WIN32__) +#include <intrin.h> +#endif + +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <immintrin.h> +#endif + +#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) + #if !defined(_tzcnt_u32) + #define _tzcnt_u32 __tzcnt_u32 + #endif + #if !defined(_tzcnt_u64) + #define _tzcnt_u64 __tzcnt_u64 + #endif +#endif + +#if defined(__LZCNT__) + #if !defined(_lzcnt_u32) + #define _lzcnt_u32 __lzcnt32 + #endif + #if !defined(_lzcnt_u64) + #define _lzcnt_u64 __lzcnt64 + #endif +#endif + +#if defined(__WIN32__) +// -- GODOT start -- +#if !defined(NOMINMAX) +// -- GODOT end -- +#define NOMINMAX +// -- GODOT start -- +#endif +#include "windows.h" +// -- GODOT end -- +#endif + +/* normally defined in pmmintrin.h, but we always need this */ +#if !defined(_MM_SET_DENORMALS_ZERO_MODE) +#define _MM_DENORMALS_ZERO_ON (0x0040) +#define _MM_DENORMALS_ZERO_OFF (0x0000) +#define _MM_DENORMALS_ZERO_MASK (0x0040) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) +#endif + +namespace embree +{ + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + + __forceinline size_t read_tsc() + { + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + return (size_t)li.QuadPart; + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + + __forceinline unsigned bsf(unsigned v) { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) + return _tzcnt_u64(v); +#else + unsigned long r = 0; _BitScanForward64(&r,v); return r; +#endif + } +#endif + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + + __forceinline unsigned bscf(unsigned& v) + { + unsigned i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline int bsr(int v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) + return 63 -_lzcnt_u64(v); +#else + unsigned long r = 0; _BitScanReverse64(&r, v); return r; +#endif + } +#endif + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline int btc(int v, int i) { + long r = v; _bittestandcomplement(&r,i); return r; + } + + __forceinline int bts(int v, int i) { + long r = v; _bittestandset(&r,i); return r; + } + + __forceinline int btr(int v, int i) { + long r = v; _bittestandreset(&r,i); return r; + } + +#if defined(__X86_64__) + + __forceinline size_t btc(size_t v, size_t i) { + size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; + } + + __forceinline size_t bts(size_t v, size_t i) { + __int64 r = v; _bittestandset64(&r,i); return r; + } + + __forceinline size_t btr(size_t v, size_t i) { + __int64 r = v; _bittestandreset64(&r,i); return r; + } + +#endif + + __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { + return _InterlockedCompareExchange((volatile long*)p,v,c); + } + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#else + +#if defined(__i386__) && defined(__PIC__) + + __forceinline void __cpuid(int out[4], int op) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) + : "0" (op1), "2" (op2)); + } + +#elif defined(__X86_ASM__) + + __forceinline void __cpuid(int out[4], int op) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + } + +#endif + + __forceinline uint64_t read_tsc() { +#if defined(__X86_ASM__) + uint32_t high,low; + asm volatile ("rdtsc" : "=d"(high), "=a"(low)); + return (((uint64_t)high) << 32) + (uint64_t)low; +#else + /* Not supported yet, meaning measuring traversal cost per pixel does not work. */ + return 0; +#endif + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#elif defined(__X86_ASM__) + int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_ctz(v); +#endif + } + +#if defined(__64BIT__) + __forceinline unsigned bsf(unsigned v) + { +#if defined(__AVX2__) + return _tzcnt_u32(v); +#elif defined(__X86_ASM__) + unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_ctz(v); +#endif + } +#endif + + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) +#if defined(__X86_64__) + return _tzcnt_u64(v); +#else + return _tzcnt_u32(v); +#endif +#elif defined(__X86_ASM__) + size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_ctzl(v); +#endif + } + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__64BIT__) + __forceinline unsigned int bscf(unsigned int& v) + { + unsigned int i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } + + __forceinline int bsr(int v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__X86_ASM__) + int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_clz(v) ^ 31; +#endif + } + +#if defined(__64BIT__) + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__X86_ASM__) + unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return __builtin_clz(v) ^ 31; +#endif + } +#endif + + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) +#if defined(__X86_64__) + return 63 - _lzcnt_u64(v); +#else + return 31 - _lzcnt_u32(v); +#endif +#elif defined(__X86_ASM__) + size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#else + return (sizeof(v) * 8 - 1) - __builtin_clzl(v); +#endif + } + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline size_t blsr(size_t v) { +#if defined(__AVX2__) +#if defined(__INTEL_COMPILER) + return _blsr_u64(v); +#else +#if defined(__X86_64__) + return __blsr_u64(v); +#else + return __blsr_u32(v); +#endif +#endif +#else + return v & (v-1); +#endif + } + + __forceinline int btc(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#else + return (v ^ (1 << i)); +#endif + } + + __forceinline int bts(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v | (v << i)); +#endif + } + + __forceinline int btr(int v, int i) { +#if defined(__X86_ASM__) + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v & ~(v << i)); +#endif + } + + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#else + return (v ^ (1 << i)); +#endif + } + + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v | (v << i)); +#endif + } + + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__X86_ASM__) + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#else + return (v & ~(v << i)); +#endif + } + + __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { + return __sync_val_compare_and_swap(value, comparand, input); + } + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(_mm_undefined_ps) + __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } +#endif +#if !defined(_mm_undefined_si128) + __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } +#endif +#if !defined(_mm256_undefined_ps) && defined(__AVX__) + __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } +#endif +#if !defined(_mm256_undefined_si256) && defined(__AVX__) + __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } +#endif +#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) + __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } +#endif +#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) + __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } +#endif +#endif + +#if defined(__SSE4_2__) + + __forceinline int popcnt(int in) { + return _mm_popcnt_u32(in); + } + + __forceinline unsigned popcnt(unsigned in) { + return _mm_popcnt_u32(in); + } + +#if defined(__64BIT__) + __forceinline size_t popcnt(size_t in) { + return _mm_popcnt_u64(in); + } +#endif + +#endif + +#if defined(__X86_ASM__) + __forceinline uint64_t rdtsc() + { + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); + return clock; + } +#endif + + __forceinline void pause_cpu(const size_t N = 8) + { + for (size_t i=0; i<N; i++) + _mm_pause(); + } + + /* prefetches */ + __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); } + __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); } + __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); } + __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); } + __forceinline void prefetchEX (const void* ptr) { +#if defined(__INTEL_COMPILER) + _mm_prefetch((const char*)ptr,_MM_HINT_ET0); +#else + _mm_prefetch((const char*)ptr,_MM_HINT_T0); +#endif + } + + __forceinline void prefetchL1EX(const void* ptr) { + prefetchEX(ptr); + } + + __forceinline void prefetchL2EX(const void* ptr) { + prefetchEX(ptr); + } +#if defined(__AVX2__) + __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } + __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } +#if defined(__X86_64__) + __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); } + __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); } +#endif +#endif + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) + __forceinline float mm512_cvtss_f32(__m512 v) { + return _mm512_cvtss_f32(v); + } + __forceinline int mm512_mask2int(__mmask16 k1) { + return _mm512_mask2int(k1); + } + __forceinline __mmask16 mm512_int2mask(int mask) { + return _mm512_int2mask(mask); + } +#else + __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3 + return _mm_cvtss_f32(_mm512_castps512_ps128(v)); + } + __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3 + return (int)k1; + } + __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3 + return (__mmask16)mask; + } +#endif +#endif +} diff --git a/thirdparty/embree/common/sys/library.cpp b/thirdparty/embree/common/sys/library.cpp new file mode 100644 index 0000000000..fc983dffd5 --- /dev/null +++ b/thirdparty/embree/common/sys/library.cpp @@ -0,0 +1,83 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "library.h" +#include "sysinfo.h" +#include "filename.h" + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { + std::string fullName = file+".dll"; + FileName executable = getExecutableFileName(); + HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); + return lib_t(handle); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return (void*)GetProcAddress(HMODULE(lib),sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + FreeLibrary(HMODULE(lib)); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <dlfcn.h> + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { +#if defined(__MACOSX__) + std::string fullName = "lib"+file+".dylib"; +#else + std::string fullName = "lib"+file+".so"; +#endif + void* lib = dlopen(fullName.c_str(), RTLD_NOW); + if (lib) return lib_t(lib); + FileName executable = getExecutableFileName(); + lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); + if (lib == nullptr) { + const char* error = dlerror(); + if (error) { + THROW_RUNTIME_ERROR(error); + } else { + THROW_RUNTIME_ERROR("could not load library "+executable.str()); + } + } + return lib_t(lib); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return dlsym(lib,sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + dlclose(lib); + } +} +#endif diff --git a/thirdparty/embree/common/sys/library.h b/thirdparty/embree/common/sys/library.h new file mode 100644 index 0000000000..67e14d2420 --- /dev/null +++ b/thirdparty/embree/common/sys/library.h @@ -0,0 +1,21 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! type for shared library */ + typedef struct opaque_lib_t* lib_t; + + /*! loads a shared library */ + lib_t openLibrary(const std::string& file); + + /*! returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym); + + /*! unloads a shared library */ + void closeLibrary(lib_t lib); +} diff --git a/thirdparty/embree/common/sys/mutex.cpp b/thirdparty/embree/common/sys/mutex.cpp new file mode 100644 index 0000000000..789feaf2d8 --- /dev/null +++ b/thirdparty/embree/common/sys/mutex.cpp @@ -0,0 +1,57 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "mutex.h" +#include "regression.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } + MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } + void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } + bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } + void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include <pthread.h> +namespace embree +{ + /*! system mutex using pthreads */ + MutexSys::MutexSys() + { + mutex = new pthread_mutex_t; + if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_init failed"); + } + + MutexSys::~MutexSys() + { + MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; + assert(ok); + delete (pthread_mutex_t*)mutex; + } + + void MutexSys::lock() + { + if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); + } + + bool MutexSys::try_lock() { + return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; + } + + void MutexSys::unlock() + { + if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); + } +}; +#endif diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h new file mode 100644 index 0000000000..4cb3626d92 --- /dev/null +++ b/thirdparty/embree/common/sys/mutex.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "intrinsics.h" +#include "atomic.h" + +namespace embree +{ + /*! system mutex */ + class MutexSys { + friend struct ConditionImplementation; + public: + MutexSys(); + ~MutexSys(); + + private: + MutexSys (const MutexSys& other) DELETED; // do not implement + MutexSys& operator= (const MutexSys& other) DELETED; // do not implement + + public: + void lock(); + bool try_lock(); + void unlock(); + + protected: + void* mutex; + }; + + /*! spinning mutex */ + class SpinLock + { + public: + + SpinLock () + : flag(false) {} + + __forceinline bool isLocked() { + return flag.load(); + } + + __forceinline void lock() + { + while (true) + { + while (flag.load()) + { + _mm_pause(); + _mm_pause(); + } + + bool expected = false; + if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) + break; + } + } + + __forceinline bool try_lock() + { + bool expected = false; + if (flag.load() != expected) { + return false; + } + return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); + } + + __forceinline void unlock() { + flag.store(false,std::memory_order_release); + } + + __forceinline void wait_until_unlocked() + { + while(flag.load()) + { + _mm_pause(); + _mm_pause(); + } + } + + public: + atomic<bool> flag; + }; + + /*! safe mutex lock and unlock helper */ + template<typename Mutex> class Lock { + public: + Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } + Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} + ~Lock() { if (locked) mutex.unlock(); } + __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } + __forceinline bool isLocked() const { return locked; } + protected: + Mutex& mutex; + bool locked; + }; +} diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h new file mode 100644 index 0000000000..697e07bb86 --- /dev/null +++ b/thirdparty/embree/common/sys/platform.h @@ -0,0 +1,392 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define _CRT_SECURE_NO_WARNINGS + +#include <cstddef> +#include <cassert> +#include <cstdlib> +#include <cstdio> +#include <memory> +#include <stdexcept> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <string> +#include <cstring> +#include <stdint.h> +#include <functional> + +//////////////////////////////////////////////////////////////////////////////// +/// detect platform +//////////////////////////////////////////////////////////////////////////////// + +/* detect 32 or 64 Intel platform */ +#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __X86_64__ +#define __X86_ASM__ +#elif defined(__i386__) || defined(_M_IX86) +#define __X86_ASM__ +#endif + +/* detect 64 bit platform */ +#if defined(__X86_64__) || defined(__aarch64__) +#define __64BIT__ +#endif + +/* detect Linux platform */ +#if defined(linux) || defined(__linux__) || defined(__LINUX__) +# if !defined(__LINUX__) +# define __LINUX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect FreeBSD platform */ +#if defined(__FreeBSD__) || defined(__FREEBSD__) +# if !defined(__FREEBSD__) +# define __FREEBSD__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ +#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) +# if !defined(__WIN32__) +# define __WIN32__ +# endif +#endif + +/* detect Cygwin platform */ +#if defined(__CYGWIN__) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect MAC OS X platform */ +#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) +# if !defined(__MACOSX__) +# define __MACOSX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* try to detect other Unix systems */ +#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Macros +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __WIN32__ +#define dll_export __declspec(dllexport) +#define dll_import __declspec(dllimport) +#else +#define dll_export __attribute__ ((visibility ("default"))) +#define dll_import +#endif + +// -- GODOT start -- +#if defined(__WIN32__) && !defined(__MINGW32__) +// -- GODOT end -- +#if !defined(__noinline) +#define __noinline __declspec(noinline) +#endif +//#define __forceinline __forceinline +//#define __restrict __restrict +#if defined(__INTEL_COMPILER) +#define __restrict__ __restrict +#else +#define __restrict__ //__restrict // causes issues with MSVC +#endif +#if !defined(__thread) +#define __thread __declspec(thread) +#endif +#if !defined(__aligned) +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#endif +//#define __FUNCTION__ __FUNCTION__ +#define debugbreak() __debugbreak() + +#else +#if !defined(__noinline) +#define __noinline __attribute__((noinline)) +#endif +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +//#define __restrict __restrict +//#define __thread __thread +#if !defined(__aligned) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif +#if !defined(__FUNCTION__) +#define __FUNCTION__ __PRETTY_FUNCTION__ +#endif +#define debugbreak() asm ("int $3") +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly + #define DELETED +#else + #define DELETED = delete +#endif + +// -- GODOT start -- +#if !defined(likely) +// -- GODOT end -- +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) __builtin_expect((bool)(expr),true ) +#define unlikely(expr) __builtin_expect((bool)(expr),false) +#endif +// -- GODOT start -- +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// Error handling and debugging +//////////////////////////////////////////////////////////////////////////////// + +/* debug printing macros */ +#define STRING(x) #x +#define TOSTRING(x) STRING(x) +#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl +#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl +#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ + abort(); + // -- GODOT end -- +#endif + +#define FATAL(x) THROW_RUNTIME_ERROR(x) +#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } + +#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") + +//////////////////////////////////////////////////////////////////////////////// +/// Basic types +//////////////////////////////////////////////////////////////////////////////// + +/* default floating-point type */ +namespace embree { + typedef float real; +} + +/* windows does not have ssize_t */ +#if defined(__WIN32__) +#if defined(__64BIT__) +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Basic utility functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline std::string toString(long long value) { + return std::to_string(value); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Disable some compiler warnings +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__INTEL_COMPILER) +//#pragma warning(disable:265 ) // floating-point operation result is out of range +//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used +//#pragma warning(disable:869 ) // parameter was never referenced +//#pragma warning(disable:981 ) // operands are evaluated in unspecified order +//#pragma warning(disable:1418) // external function definition with no prior declaration +//#pragma warning(disable:1419) // external declaration in primary source file +//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable +//#pragma warning(disable:94 ) // the size of an array must be greater than zero +//#pragma warning(disable:1599) // declaration hides parameter +//#pragma warning(disable:424 ) // extra ";" ignored +#pragma warning(disable:2196) // routine is both "inline" and "noinline" +//#pragma warning(disable:177 ) // label was declared but never referenced +//#pragma warning(disable:114 ) // function was referenced but not defined +//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function +#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient +#endif + +#if defined(_MSC_VER) +//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union +#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) +//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data +#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +//#pragma warning(disable:4355) // 'this' : used in base member initializer list +//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch +//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch +//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' +//#pragma warning(disable:4068) // unknown pragma +//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned +//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) +//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored +#pragma warning(disable:4503) // decorated name length exceeded, name was truncated +#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored +#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used + +# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) +# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings +# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 +# endif + +#endif + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +//#pragma clang diagnostic ignored "-Wunknown-pragmas" +//#pragma clang diagnostic ignored "-Wunused-variable" +//#pragma clang diagnostic ignored "-Wreorder" +//#pragma clang diagnostic ignored "-Wmicrosoft" +//#pragma clang diagnostic ignored "-Wunused-private-field" +//#pragma clang diagnostic ignored "-Wunused-local-typedef" +//#pragma clang diagnostic ignored "-Wunused-function" +//#pragma clang diagnostic ignored "-Wnarrowing" +//#pragma clang diagnostic ignored "-Wc++11-narrowing" +//#pragma clang diagnostic ignored "-Wdeprecated-register" +//#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wpragmas" +//#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +//#pragma GCC diagnostic ignored "-Warray-bounds" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wparentheses" +#endif + +#if defined(__clang__) && defined(__WIN32__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmicrosoft-cast" +#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" +#pragma clang diagnostic ignored "-Wmicrosoft-include" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunknown-pragmas" +#endif + +/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ +#if defined(__WIN32__) && defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated +#elif defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated +#elif defined(__clang__) +#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(__GNUC__) +#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(_MSC_VER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated +#endif + +/* embree output stream */ +#define embree_ostream std::ostream& +#define embree_cout std::cout +#define embree_cout_uniform std::cout +#define embree_endl std::endl + +//////////////////////////////////////////////////////////////////////////////// +/// Some macros for static profiling +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +namespace embree +{ + template<typename Closure> + struct OnScopeExitHelper + { + OnScopeExitHelper (const Closure f) : active(true), f(f) {} + ~OnScopeExitHelper() { if (active) f(); } + void deactivate() { active = false; } + bool active; + const Closure f; + }; + + template <typename Closure> + OnScopeExitHelper<Closure> OnScopeExit(const Closure f) { + return OnScopeExitHelper<Closure>(f); + } + +#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) +#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define ON_SCOPE_EXIT(code) \ + auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) + + template<typename Ty> + std::unique_ptr<Ty> make_unique(Ty* ptr) { + return std::unique_ptr<Ty>(ptr); + } + +} diff --git a/thirdparty/embree/common/sys/ref.h b/thirdparty/embree/common/sys/ref.h new file mode 100644 index 0000000000..c2b56c1908 --- /dev/null +++ b/thirdparty/embree/common/sys/ref.h @@ -0,0 +1,122 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "atomic.h" + +namespace embree +{ + struct NullTy { + }; + + extern MAYBE_UNUSED NullTy null; + + class RefCount + { + public: + RefCount(int val = 0) : refCounter(val) {} + virtual ~RefCount() {}; + + virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } + virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } + private: + std::atomic<size_t> refCounter; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Reference to single object + //////////////////////////////////////////////////////////////////////////////// + + template<typename Type> + class Ref + { + public: + Type* ptr; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Ref() : ptr(nullptr) {} + __forceinline Ref(NullTy) : ptr(nullptr) {} + __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } + __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } + + __forceinline Ref(Type* const input) : ptr(input) + { + if (ptr) + ptr->refInc(); + } + + __forceinline ~Ref() + { + if (ptr) + ptr->refDec(); + } + + __forceinline Ref& operator =(const Ref& input) + { + if (input.ptr) + input.ptr->refInc(); + if (ptr) + ptr->refDec(); + ptr = input.ptr; + return *this; + } + + __forceinline Ref& operator =(Ref&& input) + { + if (ptr) + ptr->refDec(); + ptr = input.ptr; + input.ptr = nullptr; + return *this; + } + + __forceinline Ref& operator =(Type* const input) + { + if (input) + input->refInc(); + if (ptr) + ptr->refDec(); + ptr = input; + return *this; + } + + __forceinline Ref& operator =(NullTy) + { + if (ptr) + ptr->refDec(); + ptr = nullptr; + return *this; + } + + __forceinline operator bool() const { return ptr != nullptr; } + + __forceinline const Type& operator *() const { return *ptr; } + __forceinline Type& operator *() { return *ptr; } + __forceinline const Type* operator ->() const { return ptr; } + __forceinline Type* operator ->() { return ptr; } + + template<typename TypeOut> + __forceinline Ref<TypeOut> cast() { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); } + + template<typename TypeOut> + __forceinline Ref<TypeOut> dynamicCast() { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + template<typename TypeOut> + __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); } + }; + + template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr < b.ptr; } + + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy ) { return a.ptr == nullptr; } + template<typename Type> __forceinline bool operator ==(NullTy , const Ref<Type>& b) { return nullptr == b.ptr; } + template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr == b.ptr; } + + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy ) { return a.ptr != nullptr; } + template<typename Type> __forceinline bool operator !=(NullTy , const Ref<Type>& b) { return nullptr != b.ptr; } + template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr != b.ptr; } +} diff --git a/thirdparty/embree/common/sys/regression.cpp b/thirdparty/embree/common/sys/regression.cpp new file mode 100644 index 0000000000..45315b1105 --- /dev/null +++ b/thirdparty/embree/common/sys/regression.cpp @@ -0,0 +1,30 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "regression.h" + +namespace embree +{ + /* registerRegressionTest is invoked from static initializers, thus + * we cannot have the regression_tests variable as global static + * variable due to issues with static variable initialization + * order. */ + std::vector<RegressionTest*>& get_regression_tests() + { + static std::vector<RegressionTest*> regression_tests; + return regression_tests; + } + + void registerRegressionTest(RegressionTest* test) + { + get_regression_tests().push_back(test); + } + + RegressionTest* getRegressionTest(size_t index) + { + if (index >= get_regression_tests().size()) + return nullptr; + + return get_regression_tests()[index]; + } +} diff --git a/thirdparty/embree/common/sys/regression.h b/thirdparty/embree/common/sys/regression.h new file mode 100644 index 0000000000..bb0bb94006 --- /dev/null +++ b/thirdparty/embree/common/sys/regression.h @@ -0,0 +1,25 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#include <vector> + +namespace embree +{ + /*! virtual interface for all regression tests */ + struct RegressionTest + { + RegressionTest (std::string name) : name(name) {} + virtual bool run() = 0; + std::string name; + }; + + /*! registers a regression test */ + void registerRegressionTest(RegressionTest* test); + + /*! run all regression tests */ + RegressionTest* getRegressionTest(size_t index); +} diff --git a/thirdparty/embree/common/sys/string.cpp b/thirdparty/embree/common/sys/string.cpp new file mode 100644 index 0000000000..f42fdc8536 --- /dev/null +++ b/thirdparty/embree/common/sys/string.cpp @@ -0,0 +1,42 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "string.h" + +#include <algorithm> +#include <ctype.h> + +namespace embree +{ + char to_lower(char c) { return char(tolower(int(c))); } + char to_upper(char c) { return char(toupper(int(c))); } + std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } + std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } + + Vec2f string_to_Vec2f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); + return Vec2f(x,y); + } + + Vec3f string_to_Vec3f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); + return Vec3f(x,y,z); + } + + Vec4f string_to_Vec4f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); str = str.substr(next+1); + const float w = std::stof(str,&next); + return Vec4f(x,y,z,w); + } +} diff --git a/thirdparty/embree/common/sys/string.h b/thirdparty/embree/common/sys/string.h new file mode 100644 index 0000000000..820076b21c --- /dev/null +++ b/thirdparty/embree/common/sys/string.h @@ -0,0 +1,37 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/vec4.h" + +namespace embree +{ + class IOStreamStateRestorer + { + public: + IOStreamStateRestorer(std::ostream& iostream) + : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { + } + + ~IOStreamStateRestorer() { + iostream.flags(flags); + iostream.precision(precision); + } + + private: + std::ostream& iostream; + std::ios::fmtflags flags; + std::streamsize precision; + }; + + std::string toLowerCase(const std::string& s); + std::string toUpperCase(const std::string& s); + + Vec2f string_to_Vec2f ( std::string str ); + Vec3f string_to_Vec3f ( std::string str ); + Vec4f string_to_Vec4f ( std::string str ); +} diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp new file mode 100644 index 0000000000..f1a59e511e --- /dev/null +++ b/thirdparty/embree/common/sys/sysinfo.cpp @@ -0,0 +1,656 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sysinfo.h" +#include "intrinsics.h" +#include "string.h" +#include "ref.h" +#if defined(__FREEBSD__) +#include <sys/cpuset.h> +#include <pthread_np.h> +typedef cpuset_t cpu_set_t; +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + NullTy null; + + std::string getPlatformName() + { +#if defined(__LINUX__) && !defined(__64BIT__) + return "Linux (32bit)"; +#elif defined(__LINUX__) && defined(__64BIT__) + return "Linux (64bit)"; +#elif defined(__FREEBSD__) && !defined(__64BIT__) + return "FreeBSD (32bit)"; +#elif defined(__FREEBSD__) && defined(__64BIT__) + return "FreeBSD (64bit)"; +#elif defined(__CYGWIN__) && !defined(__64BIT__) + return "Cygwin (32bit)"; +#elif defined(__CYGWIN__) && defined(__64BIT__) + return "Cygwin (64bit)"; +#elif defined(__WIN32__) && !defined(__64BIT__) + return "Windows (32bit)"; +#elif defined(__WIN32__) && defined(__64BIT__) + return "Windows (64bit)"; +#elif defined(__MACOSX__) && !defined(__64BIT__) + return "Mac OS X (32bit)"; +#elif defined(__MACOSX__) && defined(__64BIT__) + return "Mac OS X (64bit)"; +#elif defined(__UNIX__) && !defined(__64BIT__) + return "Unix (32bit)"; +#elif defined(__UNIX__) && defined(__64BIT__) + return "Unix (64bit)"; +#else + return "Unknown"; +#endif + } + + std::string getCompilerName() + { +#if defined(__INTEL_COMPILER) + int icc_mayor = __INTEL_COMPILER / 100 % 100; + int icc_minor = __INTEL_COMPILER % 100; + std::string version = "Intel Compiler "; + version += toString(icc_mayor); + version += "." + toString(icc_minor); +#if defined(__INTEL_COMPILER_UPDATE) + version += "." + toString(__INTEL_COMPILER_UPDATE); +#endif + return version; +#elif defined(__clang__) + return "CLANG " __clang_version__; +#elif defined (__GNUC__) + return "GCC " __VERSION__; +#elif defined(_MSC_VER) + std::string version = toString(_MSC_FULL_VER); + version.insert(4,"."); + version.insert(9,"."); + version.insert(2,"."); + return "Visual C++ Compiler " + version; +#else + return "Unknown Compiler"; +#endif + } + + std::string getCPUVendor() + { +#if defined(__X86_ASM__) + int cpuinfo[4]; + __cpuid (cpuinfo, 0); + int name[4]; + name[0] = cpuinfo[1]; + name[1] = cpuinfo[3]; + name[2] = cpuinfo[2]; + name[3] = 0; + return (char*)name; +#elif defined(__ARM_NEON) + return "ARM"; +#else + return "Unknown"; +#endif + } + + CPU getCPUModel() + { +#if defined(__X86_ASM__) + if (getCPUVendor() != "GenuineIntel") + return CPU::UNKNOWN; + + int out[4]; + __cpuid(out, 0); + if (out[0] < 1) return CPU::UNKNOWN; + __cpuid(out, 1); + + /* please see CPUID documentation for these formulas */ + uint32_t family_ID = (out[0] >> 8) & 0x0F; + uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; + + uint32_t model_ID = (out[0] >> 4) & 0x0F; + uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; + + uint32_t DisplayFamily = family_ID; + if (family_ID == 0x0F) + DisplayFamily += extended_family_ID; + + uint32_t DisplayModel = model_ID; + if (family_ID == 0x06 || family_ID == 0x0F) + DisplayModel += extended_model_ID << 4; + + uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); + + // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) + if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; + if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; + if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; + if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; + + if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; + if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; + +#elif defined(__ARM_NEON) + return CPU::ARM; +#endif + + return CPU::UNKNOWN; + } + + std::string stringOfCPUModel(CPU model) + { + switch (model) { + case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; + case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; + case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; + case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; + case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; + case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; + case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; + case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; + case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; + case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; + case CPU::XEON_BROADWELL : return "Xeon Broadwell"; + case CPU::CORE_BROADWELL : return "Core Broadwell"; + case CPU::XEON_HASWELL : return "Xeon Haswell"; + case CPU::CORE_HASWELL : return "Core Haswell"; + case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; + case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; + case CPU::SANDY_BRIDGE : return "Sandy Bridge"; + case CPU::NEHALEM : return "Nehalem"; + case CPU::CORE2 : return "Core2"; + case CPU::CORE1 : return "Core"; + case CPU::ARM : return "ARM"; + case CPU::UNKNOWN : return "Unknown CPU"; + } + return "Unknown CPU (error)"; + } + +#if defined(__X86_ASM__) + /* constants to access destination registers of CPUID instruction */ + static const int EAX = 0; + static const int EBX = 1; + static const int ECX = 2; + static const int EDX = 3; + + /* cpuid[eax=1].ecx */ + static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; + static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; + static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; + static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; + static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; + //static const int CPU_FEATURE_BIT_MOVBE = 1 << 22; + static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; + //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; + static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; + static const int CPU_FEATURE_BIT_AVX = 1 << 28; + static const int CPU_FEATURE_BIT_F16C = 1 << 29; + static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; + + /* cpuid[eax=1].edx */ + static const int CPU_FEATURE_BIT_SSE = 1 << 25; + static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; + + /* cpuid[eax=0x80000001].ecx */ + static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; + + /* cpuid[eax=7,ecx=0].ebx */ + static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; + static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; + static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; + static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) + static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) + static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) + static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) + static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) + static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) + static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) + static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) + + /* cpuid[eax=7,ecx=0].ecx */ + static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif + +#if defined(__X86_ASM__) + __noinline int64_t get_xcr0() + { +// -- GODOT start -- +#if defined (__WIN32__) && !defined (__MINGW32__) +// -- GODOT end -- + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +#else + int xcr0 = 0; + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); + return xcr0; +#endif + } +#endif + + int getCPUFeatures() + { +#if defined(__X86_ASM__) + /* cache CPU features access */ + static int cpu_features = 0; + if (cpu_features) + return cpu_features; + + /* get number of CPUID leaves */ + int cpuid_leaf0[4]; + __cpuid(cpuid_leaf0, 0x00000000); + unsigned nIds = cpuid_leaf0[EAX]; + + /* get number of extended CPUID leaves */ + int cpuid_leafe[4]; + __cpuid(cpuid_leafe, 0x80000000); + unsigned nExIds = cpuid_leafe[EAX]; + + /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ + int cpuid_leaf_1[4] = { 0,0,0,0 }; + int cpuid_leaf_7[4] = { 0,0,0,0 }; + int cpuid_leaf_e1[4] = { 0,0,0,0 }; + if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); +#if _WIN32 +#if _MSC_VER && (_MSC_FULL_VER < 160040219) +#else + if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); +#endif +#else + if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); +#endif + if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); + + /* detect if OS saves XMM, YMM, and ZMM states */ + bool xmm_enabled = true; + bool ymm_enabled = false; + bool zmm_enabled = false; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { + int64_t xcr0 = get_xcr0(); + xmm_enabled = ((xcr0 & 0x02) == 0x02); /* checks if xmm are enabled in XCR0 */ + ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ + zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ + } + if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; + if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; + if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; + + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; + + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; + if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; + + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features |= CPU_FEATURE_AVX512PF; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; + if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; + + return cpu_features; +#elif defined(__ARM_NEON) + /* emulated features with sse2neon */ + return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED; +#else + /* Unknown CPU. */ + return 0; +#endif + } + + std::string stringOfCPUFeatures(int features) + { + std::string str; + if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; + if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; + if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; + if (features & CPU_FEATURE_SSE ) str += "SSE "; + if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; + if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; + if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; + if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; + if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; + if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; + if (features & CPU_FEATURE_AVX ) str += "AVX "; + if (features & CPU_FEATURE_F16C ) str += "F16C "; + if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; + if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; + if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; + if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; + if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; + if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; + if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; + if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; + if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; + if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; + if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; + if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW "; + if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; + if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; + if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + return str; + } + + std::string stringOfISA (int isa) + { + if (isa == SSE) return "SSE"; + if (isa == SSE2) return "SSE2"; + if (isa == SSE3) return "SSE3"; + if (isa == SSSE3) return "SSSE3"; + if (isa == SSE41) return "SSE4.1"; + if (isa == SSE42) return "SSE4.2"; + if (isa == AVX) return "AVX"; + if (isa == AVX2) return "AVX2"; + if (isa == AVX512) return "AVX512"; + return "UNKNOWN"; + } + + bool hasISA(int features, int isa) { + return (features & isa) == isa; + } + + std::string supportedTargetList (int features) + { + std::string v; + if (hasISA(features,SSE)) v += "SSE "; + if (hasISA(features,SSE2)) v += "SSE2 "; + if (hasISA(features,SSE3)) v += "SSE3 "; + if (hasISA(features,SSSE3)) v += "SSSE3 "; + if (hasISA(features,SSE41)) v += "SSE4.1 "; + if (hasISA(features,SSE42)) v += "SSE4.2 "; + if (hasISA(features,AVX)) v += "AVX "; + if (hasISA(features,AVXI)) v += "AVXI "; + if (hasISA(features,AVX2)) v += "AVX2 "; + if (hasISA(features,AVX512)) v += "AVX512 "; + return v; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <psapi.h> + +namespace embree +{ + std::string getExecutableFileName() { + char filename[1024]; + if (!GetModuleFileName(nullptr, filename, sizeof(filename))) + return std::string(); + return std::string(filename); + } + + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); + + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0; + for (int i = 0; i < groups; i++) + totalProcessors += pGetActiveProcessorCount(i); + nThreads = totalProcessors; + } + else + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + nThreads = sysinfo.dwNumberOfProcessors; + } + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); + if (handle == INVALID_HANDLE_VALUE) return 80; + CONSOLE_SCREEN_BUFFER_INFO info; + memset(&info,0,sizeof(info)); + GetConsoleScreenBufferInfo(handle, &info); + return info.dwSize.X; + } + + double getSeconds() + { + LARGE_INTEGER freq, val; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&val); + return (double)val.QuadPart / (double)freq.QuadPart; + } + + void sleepSeconds(double t) { + Sleep(DWORD(1000.0*t)); + } + + size_t getVirtualMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.QuotaPeakPagedPoolUsage; + } + + size_t getResidentMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.WorkingSetSize; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include <stdio.h> +#include <unistd.h> + +namespace embree +{ + std::string getExecutableFileName() + { + std::string pid = "/proc/" + toString(getpid()) + "/exe"; + char buf[4096]; + memset(buf,0,sizeof(buf)); + if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return virt*sysconf(_SC_PAGE_SIZE); + } + + size_t getResidentMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return resident*sysconf(_SC_PAGE_SIZE); + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__FreeBSD__) + +#include <sys/sysctl.h> + +namespace embree +{ + std::string getExecutableFileName() + { + const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + char buf[4096]; + memset(buf,0,sizeof(buf)); + size_t len = sizeof(buf)-1; + if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Mac OS X Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach-o/dyld.h> + +namespace embree +{ + std::string getExecutableFileName() + { + char buf[4096]; + uint32_t size = sizeof(buf); + if (_NSGetExecutablePath(buf, &size) != 0) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <pthread.h> + +namespace embree +{ + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +// -- GODOT start -- +// #if defined(__MACOSX__) +#if defined(__MACOSX__) || defined(__ANDROID__) +// -- GODOT end -- + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); +#else + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + nThreads = CPU_COUNT(&set); +#endif + + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + struct winsize info; + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; + return info.ws_col; + } + + double getSeconds() { + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; + } + + void sleepSeconds(double t) { + usleep(1000000.0*t); + } +} +#endif + diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h new file mode 100644 index 0000000000..72351d12e4 --- /dev/null +++ b/thirdparty/embree/common/sys/sysinfo.h @@ -0,0 +1,178 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define CACHELINE_SIZE 64 + +#if !defined(PAGE_SIZE) + #define PAGE_SIZE 4096 +#endif + +#define PAGE_SIZE_2M (2*1024*1024) +#define PAGE_SIZE_4K (4*1024) + +#include "platform.h" + +/* define isa namespace and ISA bitvector */ +#if defined (__AVX512VL__) +# define isa avx512 +# define ISA AVX512 +# define ISA_STR "AVX512" +#elif defined (__AVX2__) +# define isa avx2 +# define ISA AVX2 +# define ISA_STR "AVX2" +#elif defined(__AVXI__) +# define isa avxi +# define ISA AVXI +# define ISA_STR "AVXI" +#elif defined(__AVX__) +# define isa avx +# define ISA AVX +# define ISA_STR "AVX" +#elif defined (__SSE4_2__) +# define isa sse42 +# define ISA SSE42 +# define ISA_STR "SSE4.2" +//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 +//# define isa sse41 +//# define ISA SSE41 +//# define ISA_STR "SSE4.1" +//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC +//# define isa ssse3 +//# define ISA SSSE3 +//# define ISA_STR "SSSE3" +//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang +//# define isa sse3 +//# define ISA SSE3 +//# define ISA_STR "SSE3" +#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) +# define isa sse2 +# define ISA SSE2 +# define ISA_STR "SSE2" +#elif defined(__SSE__) +# define isa sse +# define ISA SSE +# define ISA_STR "SSE" +#else +#error Unknown ISA +#endif + +namespace embree +{ + enum class CPU + { + XEON_ICE_LAKE, + CORE_ICE_LAKE, + CORE_TIGER_LAKE, + CORE_COMET_LAKE, + CORE_CANNON_LAKE, + CORE_KABY_LAKE, + XEON_SKY_LAKE, + CORE_SKY_LAKE, + XEON_PHI_KNIGHTS_MILL, + XEON_PHI_KNIGHTS_LANDING, + XEON_BROADWELL, + CORE_BROADWELL, + XEON_HASWELL, + CORE_HASWELL, + XEON_IVY_BRIDGE, + CORE_IVY_BRIDGE, + SANDY_BRIDGE, + NEHALEM, + CORE2, + CORE1, + ARM, + UNKNOWN, + }; + + /*! get the full path to the running executable */ + std::string getExecutableFileName(); + + /*! return platform name */ + std::string getPlatformName(); + + /*! get the full name of the compiler */ + std::string getCompilerName(); + + /*! return the name of the CPU */ + std::string getCPUVendor(); + + /*! get microprocessor model */ + CPU getCPUModel(); + + /*! converts CPU model into string */ + std::string stringOfCPUModel(CPU model); + + /*! CPU features */ + static const int CPU_FEATURE_SSE = 1 << 0; + static const int CPU_FEATURE_SSE2 = 1 << 1; + static const int CPU_FEATURE_SSE3 = 1 << 2; + static const int CPU_FEATURE_SSSE3 = 1 << 3; + static const int CPU_FEATURE_SSE41 = 1 << 4; + static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_POPCNT = 1 << 6; + static const int CPU_FEATURE_AVX = 1 << 7; + static const int CPU_FEATURE_F16C = 1 << 8; + static const int CPU_FEATURE_RDRAND = 1 << 9; + static const int CPU_FEATURE_AVX2 = 1 << 10; + static const int CPU_FEATURE_FMA3 = 1 << 11; + static const int CPU_FEATURE_LZCNT = 1 << 12; + static const int CPU_FEATURE_BMI1 = 1 << 13; + static const int CPU_FEATURE_BMI2 = 1 << 14; + static const int CPU_FEATURE_AVX512F = 1 << 16; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512PF = 1 << 18; + static const int CPU_FEATURE_AVX512ER = 1 << 19; + static const int CPU_FEATURE_AVX512CD = 1 << 20; + static const int CPU_FEATURE_AVX512BW = 1 << 21; + static const int CPU_FEATURE_AVX512VL = 1 << 22; + static const int CPU_FEATURE_AVX512IFMA = 1 << 23; + static const int CPU_FEATURE_AVX512VBMI = 1 << 24; + static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; + static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; + static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; + + /*! get CPU features */ + int getCPUFeatures(); + + /*! convert CPU features into a string */ + std::string stringOfCPUFeatures(int features); + + /*! creates a string of all supported targets that are supported */ + std::string supportedTargetList (int isa); + + /*! ISAs */ + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE2 = SSE | CPU_FEATURE_SSE2; + static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; + static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; + static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; + static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; + static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; + static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; + static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; + static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + + /*! converts ISA bitvector into a string */ + std::string stringOfISA(int features); + + /*! return the number of logical threads of the system */ + unsigned int getNumberOfLogicalThreads(); + + /*! returns the size of the terminal window in characters */ + int getTerminalWidth(); + + /*! returns performance counter in seconds */ + double getSeconds(); + + /*! sleeps the specified number of seconds */ + void sleepSeconds(double t); + + /*! returns virtual address space occupied by process */ + size_t getVirtualMemoryBytes(); + + /*! returns resident memory required by process */ + size_t getResidentMemoryBytes(); +} diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp new file mode 100644 index 0000000000..f4014be89b --- /dev/null +++ b/thirdparty/embree/common/sys/thread.cpp @@ -0,0 +1,474 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "thread.h" +#include "sysinfo.h" +#include "string.h" + +#include <iostream> +#if defined(__ARM_NEON) +#include "../simd/arm/emulation.h" +#else +#include <xmmintrin.h> +#endif + +#if defined(PTHREADS_WIN32) +#pragma comment (lib, "pthreadVC.lib") +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +namespace embree +{ + /*! set the affinity of a given thread */ + void setAffinity(HANDLE thread, ssize_t affinity) + { + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); + typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); + SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); + SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0, group = 0, number = 0; + for (int i = 0; i<groups; i++) { + int processors = pGetActiveProcessorCount(i); + if (totalProcessors + processors > affinity) { + group = i; + number = (int)affinity - totalProcessors; + break; + } + totalProcessors += processors; + } + + GROUP_AFFINITY groupAffinity; + groupAffinity.Group = (WORD)group; + groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) + WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning + + PROCESSOR_NUMBER processorNumber; + processorNumber.Group = group; + processorNumber.Number = number; + processorNumber.Reserved = 0; + if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) + WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning + } + else + { + if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) + WARNING("SetThreadAffinityMask failed"); // on purpose only a warning + if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) + WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning + } + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) { + setAffinity(GetCurrentThread(), affinity); + } + + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg) + : f(f), arg(arg) {} + public: + thread_func f; + void* arg; + }; + + DWORD WINAPI threadStartup(LPVOID ptr) + { + ThreadStartupData* parg = (ThreadStartupData*) ptr; + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + parg->f(parg->arg); + delete parg; + return 0; + } + +#if !defined(PTHREADS_WIN32) + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); + if (thread == nullptr) FATAL("CreateThread failed"); + if (threadID >= 0) setAffinity(thread, threadID); + return thread_t(thread); + } + + /*! the thread calling this function gets yielded */ + void yield() { + SwitchToThread(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + WaitForSingleObject(HANDLE(tid), INFINITE); + CloseHandle(HANDLE(tid)); + } + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { + TerminateThread(HANDLE(tid),0); + CloseHandle(HANDLE(tid)); + } + + /*! creates thread local storage */ + tls_t createTls() { + return tls_t(size_t(TlsAlloc())); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) { + TlsSetValue(DWORD(size_t(tls)), ptr); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) { + return TlsGetValue(DWORD(size_t(tls))); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) { + TlsFree(DWORD(size_t(tls))); + } +#endif +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +// -- GODOT start -- +#if defined(__LINUX__) && !defined(__ANDROID__) +// -- GODOT end -- + +#include <fstream> +#include <sstream> +#include <algorithm> + +namespace embree +{ + static MutexSys mutex; + static std::vector<size_t> threadIDs; + + /* changes thread ID mapping such that we first fill up all thread on one core */ + size_t mapThreadID(size_t threadID) + { + Lock<MutexSys> lock(mutex); + + if (threadIDs.size() == 0) + { + /* parse thread/CPU topology */ + for (size_t cpuID=0;;cpuID++) + { + std::fstream fs; + std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); + fs.open (cpu.c_str(), std::fstream::in); + if (fs.fail()) break; + + int i; + while (fs >> i) + { + if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) + threadIDs.push_back(i); + if (fs.peek() == ',') + fs.ignore(); + } + fs.close(); + } + +#if 0 + for (size_t i=0;i<threadIDs.size();i++) + std::cout << i << " -> " << threadIDs[i] << std::endl; +#endif + + /* verify the mapping and do not use it if the mapping has errors */ + for (size_t i=0;i<threadIDs.size();i++) { + for (size_t j=0;j<threadIDs.size();j++) { + if (i != j && threadIDs[i] == threadIDs[j]) { + threadIDs.clear(); + } + } + } + } + + /* re-map threadIDs if mapping is available */ + size_t ID = threadID; + if (threadID < threadIDs.size()) + ID = threadIDs[threadID]; + + /* find correct thread to affinitize to */ + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + { + for (int i=0, j=0; i<CPU_SETSIZE; i++) + { + if (!CPU_ISSET(i,&set)) continue; + + if (j == ID) { + ID = i; + break; + } + j++; + } + } + + return ID; + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpu_set_t cset; + CPU_ZERO(&cset); + size_t threadID = mapThreadID(affinity); + CPU_SET(threadID, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +// -- GODOT start -- +//////////////////////////////////////////////////////////////////////////////// +/// Android Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__ANDROID__) + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpu_set_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + sched_setaffinity(0, sizeof(cset), &cset); + } +} +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__FreeBSD__) + +#include <pthread_np.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// MacOSX Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include <mach/thread_act.h> +#include <mach/thread_policy.h> +#include <mach/mach_init.h> + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { +#if !defined(__ARM_NEON) // affinity seems not supported on M1 chip + + thread_affinity_policy ap; + ap.affinity_tag = affinity; + if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) + WARNING("setting thread affinity failed"); // on purpose only a warning + +#endif + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) + +#include <pthread.h> +#include <sched.h> + +#if defined(__USE_NUMA__) +#include <numa.h> +#endif + +namespace embree +{ + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg, int affinity) + : f(f), arg(arg), affinity(affinity) {} + public: + thread_func f; + void* arg; + ssize_t affinity; + }; + + static void* threadStartup(ThreadStartupData* parg) + { + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + + /*! Mac OS X does not support setting affinity at thread creation time */ +#if defined(__MACOSX__) + if (parg->affinity >= 0) + setAffinity(parg->affinity); +#endif + + parg->f(parg->arg); + delete parg; + return nullptr; + } + + /*! creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + /* set stack size */ + pthread_attr_t attr; + pthread_attr_init(&attr); + if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); + + /* create thread */ + pthread_t* tid = new pthread_t; + if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { + pthread_attr_destroy(&attr); + delete tid; + FATAL("pthread_create failed"); + } + pthread_attr_destroy(&attr); + + /* set affinity */ +// -- GODOT start -- +#if defined(__LINUX__) && !defined(__ANDROID__) +// -- GODOT end -- + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + threadID = mapThreadID(threadID); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#elif defined(__FreeBSD__) + if (threadID >= 0) { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +// -- GODOT start -- +#elif defined(__ANDROID__) + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); + } +#endif +// -- GODOT end -- + + return thread_t(tid); + } + + /*! the thread calling this function gets yielded */ + void yield() { + sched_yield(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + if (pthread_join(*(pthread_t*)tid, nullptr) != 0) + FATAL("pthread_join failed"); + delete (pthread_t*)tid; + } + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { +// -- GODOT start -- +#if defined(__ANDROID__) + FATAL("Can't destroy threads on Android."); +#else + pthread_cancel(*(pthread_t*)tid); + delete (pthread_t*)tid; +#endif +// -- GODOT end -- + } + + /*! creates thread local storage */ + tls_t createTls() + { + pthread_key_t* key = new pthread_key_t; + if (pthread_key_create(key,nullptr) != 0) { + delete key; + FATAL("pthread_key_create failed"); + } + + return tls_t(key); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) + { + assert(tls); + return pthread_getspecific(*(pthread_key_t*)tls); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) + { + assert(tls); + if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) + FATAL("pthread_setspecific failed"); + } + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls) + { + assert(tls); + if (pthread_key_delete(*(pthread_key_t*)tls) != 0) + FATAL("pthread_key_delete failed"); + delete (pthread_key_t*)tls; + } +} + +#endif diff --git a/thirdparty/embree/common/sys/thread.h b/thirdparty/embree/common/sys/thread.h new file mode 100644 index 0000000000..92a10d5c5d --- /dev/null +++ b/thirdparty/embree/common/sys/thread.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "mutex.h" +#include "alloc.h" +#include "vector.h" +#include <vector> + +namespace embree +{ + /*! type for thread */ + typedef struct opaque_thread_t* thread_t; + + /*! signature of thread start function */ + typedef void (*thread_func)(void*); + + /*! creates a hardware thread running on specific logical thread */ + thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity); + + /*! the thread calling this function gets yielded */ + void yield(); + + /*! waits until the given thread has terminated */ + void join(thread_t tid); + + /*! destroy handle of a thread */ + void destroyThread(thread_t tid); + + /*! type for handle to thread local storage */ + typedef struct opaque_tls_t* tls_t; + + /*! creates thread local storage */ + tls_t createTls(); + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr); + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls); + + /*! destroys thread local storage identifier */ + void destroyTls(tls_t tls); +} diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h new file mode 100644 index 0000000000..f832626789 --- /dev/null +++ b/thirdparty/embree/common/sys/vector.h @@ -0,0 +1,242 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "alloc.h" +#include <algorithm> + +namespace embree +{ + template<typename T, typename allocator> + class vector_t + { + public: + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + + __forceinline vector_t () + : size_active(0), size_alloced(0), items(nullptr) {} + + __forceinline explicit vector_t (size_t sz) + : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + template<typename M> + __forceinline explicit vector_t (M alloc, size_t sz) + : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline ~vector_t() { + clear(); + } + + __forceinline vector_t (const vector_t& other) + { + size_active = other.size_active; + size_alloced = other.size_alloced; + items = alloc.allocate(size_alloced); + for (size_t i=0; i<size_active; i++) + ::new (&items[i]) value_type(other.items[i]); + } + + __forceinline vector_t (vector_t&& other) + : alloc(std::move(other.alloc)) + { + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + } + + __forceinline vector_t& operator=(const vector_t& other) + { + resize(other.size_active); + for (size_t i=0; i<size_active; i++) + items[i] = value_type(other.items[i]); + return *this; + } + + __forceinline vector_t& operator=(vector_t&& other) + { + clear(); + alloc = std::move(other.alloc); + size_active = other.size_active; other.size_active = 0; + size_alloced = other.size_alloced; other.size_alloced = 0; + items = other.items; other.items = nullptr; + return *this; + } + + /********************** Iterators ****************************/ + + __forceinline iterator begin() { return items; }; + __forceinline const_iterator begin() const { return items; }; + + __forceinline iterator end () { return items+size_active; }; + __forceinline const_iterator end () const { return items+size_active; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return size_active == 0; } + __forceinline size_t size () const { return size_active; } + __forceinline size_t capacity () const { return size_alloced; } + + + __forceinline void resize(size_t new_size) { + internal_resize(new_size,internal_grow_size(new_size)); + } + + __forceinline void reserve(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return; + + /* resize exact otherwise */ + internal_resize(size_active,new_alloced); + } + + __forceinline void shrink_to_fit() { + internal_resize(size_active,size_active); + } + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < size_active); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; } + + __forceinline T& front() const { assert(size_active > 0); return items[0]; }; + __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& nt) + { + const T v = nt; // need local copy as input reference could point to this vector + internal_resize(size_active,internal_grow_size(size_active+1)); + ::new (&items[size_active++]) T(v); + } + + __forceinline void pop_back() + { + assert(!empty()); + size_active--; + alloc.destroy(&items[size_active]); + } + + __forceinline void clear() + { + /* destroy elements */ + for (size_t i=0; i<size_active; i++) + alloc.destroy(&items[i]); + + /* free memory */ + alloc.deallocate(items,size_alloced); + items = nullptr; + size_active = size_alloced = 0; + } + + /******************** Comparisons **************************/ + + friend bool operator== (const vector_t& a, const vector_t& b) + { + if (a.size() != b.size()) return false; + for (size_t i=0; i<a.size(); i++) + if (a[i] != b[i]) + return false; + return true; + } + + friend bool operator!= (const vector_t& a, const vector_t& b) { + return !(a==b); + } + + private: + + __forceinline void internal_resize_init(size_t new_active) + { + assert(size_active == 0); + assert(size_alloced == 0); + assert(items == nullptr); + if (new_active == 0) return; + items = alloc.allocate(new_active); + for (size_t i=0; i<new_active; i++) ::new (&items[i]) T(); + size_active = new_active; + size_alloced = new_active; + } + + __forceinline void internal_resize(size_t new_active, size_t new_alloced) + { + assert(new_active <= new_alloced); + + /* destroy elements */ + if (new_active < size_active) + { + for (size_t i=new_active; i<size_active; i++) + alloc.destroy(&items[i]); + size_active = new_active; + } + + /* only reallocate if necessary */ + if (new_alloced == size_alloced) { + for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T; + size_active = new_active; + return; + } + + /* reallocate and copy items */ + T* old_items = items; + items = alloc.allocate(new_alloced); + for (size_t i=0; i<size_active; i++) { + ::new (&items[i]) T(std::move(old_items[i])); + alloc.destroy(&old_items[i]); + } + + for (size_t i=size_active; i<new_active; i++) { + ::new (&items[i]) T; + } + + alloc.deallocate(old_items,size_alloced); + size_active = new_active; + size_alloced = new_alloced; + } + + __forceinline size_t internal_grow_size(size_t new_alloced) + { + /* do nothing if container already large enough */ + if (new_alloced <= size_alloced) + return size_alloced; + + /* resize to next power of 2 otherwise */ + size_t new_size_alloced = size_alloced; + while (new_size_alloced < new_alloced) { + new_size_alloced = std::max(size_t(1),2*new_size_alloced); + } + return new_size_alloced; + } + + private: + allocator alloc; + size_t size_active; // number of valid items + size_t size_alloced; // number of items allocated + T* items; // data array + }; + + /*! vector class that performs standard allocations */ + template<typename T> + using vector = vector_t<T,std::allocator<T>>; + + /*! vector class that performs aligned allocations */ + template<typename T> + using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >; + + /*! vector class that performs OS allocations */ + template<typename T> + using ovector = vector_t<T,os_allocator<T> >; +} diff --git a/thirdparty/embree/common/tasking/taskscheduler.h b/thirdparty/embree/common/tasking/taskscheduler.h new file mode 100644 index 0000000000..8f3dd87689 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskscheduler.h @@ -0,0 +1,15 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#if defined(TASKING_INTERNAL) +# include "taskschedulerinternal.h" +#elif defined(TASKING_TBB) +# include "taskschedulertbb.h" +#elif defined(TASKING_PPL) +# include "taskschedulerppl.h" +#else +# error "no tasking system enabled" +#endif + diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp new file mode 100644 index 0000000000..ad438588a3 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp @@ -0,0 +1,420 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "taskschedulerinternal.h" +#include "../math/math.h" +#include "../sys/sysinfo.h" +#include <algorithm> + +namespace embree +{ + RTC_NAMESPACE_BEGIN + + static MutexSys g_mutex; + size_t TaskScheduler::g_numThreads = 0; + __thread TaskScheduler* TaskScheduler::g_instance = nullptr; + std::vector<Ref<TaskScheduler>> g_instance_vector; + __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; + TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; + + template<typename Predicate, typename Body> + __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) + { + while (true) + { + /*! some rounds that yield */ + for (size_t i=0; i<32; i++) + { + /*! some spinning rounds */ + const size_t threadCount = thread.threadCount(); + for (size_t j=0; j<1024; j+=threadCount) + { + if (!pred()) return; + if (thread.scheduler->steal_from_other_threads(thread)) { + i=j=0; + body(); + } + } + yield(); + } + } + } + + /*! run this task */ + void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible + { + /* try to run if not already stolen */ + if (try_switch_state(INITIALIZED,DONE)) + { + Task* prevTask = thread.task; + thread.task = this; + // -- GODOT start -- + // try { + // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); + // } catch (...) { + // if (thread.scheduler->cancellingException == nullptr) + // thread.scheduler->cancellingException = std::current_exception(); + // } + // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } + + /* steal until all dependencies have completed */ + steal_loop(thread, + [&] () { return dependencies>0; }, + [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); + + /* now signal our parent task that we are finished */ + if (parent) + parent->add_dependencies(-1); + } + + /*! run this task */ + dll_export void TaskScheduler::Task::run (Thread& thread) { + run_internal(thread); + } + + bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) + { + /* stop if we run out of local tasks or reach the waiting task */ + if (right == 0 || &tasks[right-1] == parent) + return false; + + /* execute task */ + size_t oldRight = right; + tasks[right-1].run_internal(thread); + if (right != oldRight) { + THROW_RUNTIME_ERROR("you have to wait for spawned subtasks"); + } + + /* pop task and closure from stack */ + right--; + if (tasks[right].stackPtr != size_t(-1)) + stackPtr = tasks[right].stackPtr; + + /* also move left pointer */ + if (left >= right) left.store(right.load()); + + return right != 0; + } + + dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { + return execute_local_internal(thread,parent); + } + + bool TaskScheduler::TaskQueue::steal(Thread& thread) + { + size_t l = left; + size_t r = right; + if (l < r) + { + l = left++; + if (l >= r) + return false; + } + else + return false; + + if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) + return false; + + thread.tasks.right++; + return true; + } + + /* we steal from the left */ + size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft() + { + if (left >= right) return 0; + return tasks[left].N; + } + + void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair) + { + TaskScheduler::ThreadPool* pool = pair->first; + size_t threadIndex = pair->second; + delete pair; + pool->thread_loop(threadIndex); + } + + TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) + : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {} + + dll_export void TaskScheduler::ThreadPool::startThreads() + { + if (running) return; + setNumThreads(numThreads,true); + } + + void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) + { + Lock<MutexSys> lock(g_mutex); + assert(newNumThreads); + newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); + + numThreads = newNumThreads; + if (!startThreads && !running) return; + running = true; + size_t numThreadsActive = numThreadsRunning; + + mutex.lock(); + numThreadsRunning = newNumThreads; + mutex.unlock(); + condition.notify_all(); + + /* start new threads */ + for (size_t t=numThreadsActive; t<numThreads; t++) + { + if (t == 0) continue; + auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t); + threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1)); + } + + /* stop some threads if we reduce the number of threads */ + for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) { + if (t == 0) continue; + embree::join(threads.back()); + threads.pop_back(); + } + } + + TaskScheduler::ThreadPool::~ThreadPool() + { + /* leave all taskschedulers */ + mutex.lock(); + numThreadsRunning = 0; + mutex.unlock(); + condition.notify_all(); + + /* wait for threads to terminate */ + for (size_t i=0; i<threads.size(); i++) + embree::join(threads[i]); + } + + dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler) + { + mutex.lock(); + schedulers.push_back(scheduler); + mutex.unlock(); + condition.notify_all(); + } + + dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler) + { + Lock<MutexSys> lock(mutex); + for (std::list<Ref<TaskScheduler> >::iterator it = schedulers.begin(); it != schedulers.end(); it++) { + if (scheduler == *it) { + schedulers.erase(it); + return; + } + } + } + + void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) + { + while (globalThreadIndex < numThreadsRunning) + { + Ref<TaskScheduler> scheduler = NULL; + ssize_t threadIndex = -1; + { + Lock<MutexSys> lock(mutex); + condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); + if (globalThreadIndex >= numThreadsRunning) break; + scheduler = schedulers.front(); + threadIndex = scheduler->allocThreadIndex(); + } + scheduler->thread_loop(threadIndex); + } + } + + TaskScheduler::TaskScheduler() + : threadCounter(0), anyTasksRunning(0), hasRootTask(false) + { + threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x. + for (size_t i=0; i<threadLocal.size(); i++) + threadLocal[i].store(nullptr); + } + + TaskScheduler::~TaskScheduler() + { + assert(threadCounter == 0); + } + + dll_export size_t TaskScheduler::threadID() + { + Thread* thread = TaskScheduler::thread(); + if (thread) return thread->threadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadIndex() + { + Thread* thread = TaskScheduler::thread(); + if (thread) return thread->threadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadCount() { + return threadPool->size(); + } + + dll_export TaskScheduler* TaskScheduler::instance() + { + if (g_instance == NULL) { + Lock<MutexSys> lock(g_mutex); + g_instance = new TaskScheduler; + g_instance_vector.push_back(g_instance); + } + return g_instance; + } + + void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads) + { + if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); + threadPool->setNumThreads(numThreads,start_threads); + } + + void TaskScheduler::destroy() { + delete threadPool; threadPool = nullptr; + } + + dll_export ssize_t TaskScheduler::allocThreadIndex() + { + size_t threadIndex = threadCounter++; + assert(threadIndex < threadLocal.size()); + return threadIndex; + } + + void TaskScheduler::join() + { + mutex.lock(); + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); + // -- GODOT start -- + // std::exception_ptr except = thread_loop(threadIndex); + // if (except != nullptr) std::rethrow_exception(except); + thread_loop(threadIndex); + // -- GODOT end -- + } + + void TaskScheduler::reset() { + hasRootTask = false; + } + + void TaskScheduler::wait_for_threads(size_t threadCount) + { + while (threadCounter < threadCount-1) + pause_cpu(); + } + + dll_export TaskScheduler::Thread* TaskScheduler::thread() { + return thread_local_thread; + } + + dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) + { + Thread* old = thread_local_thread; + thread_local_thread = thread; + return old; + } + + dll_export bool TaskScheduler::wait() + { + Thread* thread = TaskScheduler::thread(); + if (thread == nullptr) return true; + while (thread->tasks.execute_local_internal(*thread,thread->task)) {}; + return thread->scheduler->cancellingException == nullptr; + } + +// -- GODOT start -- +// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) + void TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + threadLocal[threadIndex].store(&thread); + Thread* oldThread = swapThread(&thread); + + /* main thread loop */ + while (anyTasksRunning) + { + steal_loop(thread, + [&] () { return anyTasksRunning > 0; }, + [&] () { + anyTasksRunning++; + while (thread.tasks.execute_local_internal(thread,nullptr)); + anyTasksRunning--; + }); + } + threadLocal[threadIndex].store(nullptr); + swapThread(oldThread); + + /* remember exception to throw */ + // -- GODOT start -- + // std::exception_ptr except = nullptr; + // if (cancellingException != nullptr) except = cancellingException; + // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; +#if defined(__WIN32__) + size_t loopIndex = 1; +#endif +#define LOOP_YIELD_THRESHOLD (4096) + while (threadCounter > 0) { +#if defined(__WIN32__) + if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) + yield(); + else + _mm_pause(); + loopIndex++; +#else + yield(); +#endif + } + // -- GODOT start -- + // return except; + return; + // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) + { + const size_t threadIndex = thread.threadIndex; + const size_t threadCount = this->threadCounter; + + for (size_t i=1; i<threadCount; i++) + { + pause_cpu(32); + size_t otherThreadIndex = threadIndex+i; + if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount; + + Thread* othread = threadLocal[otherThreadIndex].load(); + if (!othread) + continue; + + if (othread->tasks.steal(thread)) + return true; + } + + return false; + } + + dll_export void TaskScheduler::startThreads() { + threadPool->startThreads(); + } + + dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) { + threadPool->add(scheduler); + } + + dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) { + threadPool->remove(scheduler); + } + + RTC_NAMESPACE_END +} diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h new file mode 100644 index 0000000000..8fa6bb12fa --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h @@ -0,0 +1,385 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" +#include "../sys/atomic.h" +#include "../math/range.h" +#include "../../include/embree3/rtcore.h" + +#include <list> + +namespace embree +{ + + /* The tasking system exports some symbols to be used by the tutorials. Thus we + hide is also in the API namespace when requested. */ + RTC_NAMESPACE_BEGIN + + struct TaskScheduler : public RefCount + { + ALIGNED_STRUCT_(64); + friend class Device; + + static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack + static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures + + struct Thread; + + /*! virtual interface for all tasks */ + struct TaskFunction { + virtual void execute() = 0; + }; + + /*! builds a task interface from a closure */ + template<typename Closure> + struct ClosureTaskFunction : public TaskFunction + { + Closure closure; + __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} + void execute() { closure(); }; + }; + + struct __aligned(64) Task + { + /*! states a task can be in */ + enum { DONE, INITIALIZED }; + + /*! switch from one state to another */ + __forceinline void switch_state(int from, int to) + { + __memory_barrier(); + MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); + assert(success); + } + + /*! try to switch from one state to another */ + __forceinline bool try_switch_state(int from, int to) { + __memory_barrier(); + return state.compare_exchange_strong(from,to); + } + + /*! increment/decrement dependency counter */ + void add_dependencies(int n) { + dependencies+=n; + } + + /*! initialize all tasks to DONE state by default */ + __forceinline Task() + : state(DONE) {} + + /*! construction of new task */ + __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) + : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) + { + if (parent) parent->add_dependencies(+1); + switch_state(DONE,INITIALIZED); + } + + /*! construction of stolen task, stealing thread will decrement initial dependency */ + __forceinline Task (TaskFunction* closure, Task* parent) + : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) + { + switch_state(DONE,INITIALIZED); + } + + /*! try to steal this task */ + bool try_steal(Task& child) + { + if (!stealable) return false; + if (!try_switch_state(INITIALIZED,DONE)) return false; + new (&child) Task(closure, this); + return true; + } + + /*! run this task */ + dll_export void run(Thread& thread); + + void run_internal(Thread& thread); + + public: + std::atomic<int> state; //!< state this task is in + std::atomic<int> dependencies; //!< dependencies to wait for + std::atomic<bool> stealable; //!< true if task can be stolen + TaskFunction* closure; //!< the closure to execute + Task* parent; //!< parent task to signal when we are finished + size_t stackPtr; //!< stack location where closure is stored + size_t N; //!< approximative size of task + }; + + struct TaskQueue + { + TaskQueue () + : left(0), right(0), stackPtr(0) {} + + __forceinline void* alloc(size_t bytes, size_t align = 64) + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("closure stack overflow"); + abort(); + // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } + + template<typename Closure> + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("task stack overflow"); + abort(); + // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; + TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure); + new (&tasks[right]) Task(func,thread.task,oldStackPtr,size); + right++; + + /* also move left pointer */ + if (left >= right-1) left = right-1; + } + + dll_export bool execute_local(Thread& thread, Task* parent); + bool execute_local_internal(Thread& thread, Task* parent); + bool steal(Thread& thread); + size_t getTaskSizeAtLeft(); + + bool empty() { return right == 0; } + + public: + + /* task stack */ + Task tasks[TASK_STACK_SIZE]; + __aligned(64) std::atomic<size_t> left; //!< threads steal from left + __aligned(64) std::atomic<size_t> right; //!< new tasks are added to the right + + /* closure stack */ + __aligned(64) char stack[CLOSURE_STACK_SIZE]; + size_t stackPtr; + }; + + /*! thread local structure for each thread */ + struct Thread + { + ALIGNED_STRUCT_(64); + + Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler) + : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} + + __forceinline size_t threadCount() { + return scheduler->threadCounter; + } + + size_t threadIndex; //!< ID of this thread + TaskQueue tasks; //!< local task queue + Task* task; //!< current active task + Ref<TaskScheduler> scheduler; //!< pointer to task scheduler + }; + + /*! pool of worker threads */ + struct ThreadPool + { + ThreadPool (bool set_affinity); + ~ThreadPool (); + + /*! starts the threads */ + dll_export void startThreads(); + + /*! sets number of threads to use */ + void setNumThreads(size_t numThreads, bool startThreads = false); + + /*! adds a task scheduler object for scheduling */ + dll_export void add(const Ref<TaskScheduler>& scheduler); + + /*! remove the task scheduler object again */ + dll_export void remove(const Ref<TaskScheduler>& scheduler); + + /*! returns number of threads of the thread pool */ + size_t size() const { return numThreads; } + + /*! main loop for all threads */ + void thread_loop(size_t threadIndex); + + private: + std::atomic<size_t> numThreads; + std::atomic<size_t> numThreadsRunning; + bool set_affinity; + std::atomic<bool> running; + std::vector<thread_t> threads; + + private: + MutexSys mutex; + ConditionSys condition; + std::list<Ref<TaskScheduler> > schedulers; + }; + + TaskScheduler (); + ~TaskScheduler (); + + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /*! lets new worker threads join the tasking system */ + void join(); + void reset(); + + /*! let a worker thread allocate a thread index */ + dll_export ssize_t allocThreadIndex(); + + /*! wait for some number of threads available (threadCount includes main thread) */ + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ + // -- GODOT start -- + // std::exception_ptr thread_loop(size_t threadIndex); + void thread_loop(size_t threadIndex); + // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); + + template<typename Predicate, typename Body> + static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) + { + if (useThreadPool) startThreads(); + + size_t threadIndex = allocThreadIndex(); + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + assert(threadLocal[threadIndex].load() == nullptr); + threadLocal[threadIndex] = &thread; + Thread* oldThread = swapThread(&thread); + thread.tasks.push_right(thread,size,closure); + { + Lock<MutexSys> lock(mutex); + anyTasksRunning++; + hasRootTask = true; + condition.notify_all(); + } + + if (useThreadPool) addScheduler(this); + + while (thread.tasks.execute_local(thread,nullptr)); + anyTasksRunning--; + if (useThreadPool) removeScheduler(this); + + threadLocal[threadIndex] = nullptr; + swapThread(oldThread); + + /* remember exception to throw */ + std::exception_ptr except = nullptr; + if (cancellingException != nullptr) except = cancellingException; + + /* wait for all threads to terminate */ + threadCounter--; + while (threadCounter > 0) yield(); + cancellingException = nullptr; + + /* re-throw proper exception */ + if (except != nullptr) + std::rethrow_exception(except); + } + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + static __forceinline void spawn(size_t size, const Closure& closure) + { + Thread* thread = TaskScheduler::thread(); + if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure); + else instance()->spawn_root(closure,size); + } + + /* spawn a new task at the top of the threads task stack */ + template<typename Closure> + static __forceinline void spawn(const Closure& closure) { + spawn(1,closure); + } + + /* spawn a new task set */ + template<typename Index, typename Closure> + static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) + { + spawn(end-begin, [=]() + { + if (end-begin <= blockSize) { + return closure(range<Index>(begin,end)); + } + const Index center = (begin+end)/2; + spawn(begin,center,blockSize,closure); + spawn(center,end ,blockSize,closure); + wait(); + }); + } + + /* work on spawned subtasks and wait until all have finished */ + dll_export static bool wait(); + + /* returns the ID of the current thread */ + dll_export static size_t threadID(); + + /* returns the index (0..threadCount-1) of the current thread */ + dll_export static size_t threadIndex(); + + /* returns the total number of threads */ + dll_export static size_t threadCount(); + + private: + + /* returns the thread local task list of this worker thread */ + dll_export static Thread* thread(); + + /* sets the thread local task list of this worker thread */ + dll_export static Thread* swapThread(Thread* thread); + + /*! returns the taskscheduler object to be used by the master thread */ + dll_export static TaskScheduler* instance(); + + /*! starts the threads */ + dll_export static void startThreads(); + + /*! adds a task scheduler object for scheduling */ + dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler); + + /*! remove the task scheduler object again */ + dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler); + + private: + std::vector<atomic<Thread*>> threadLocal; + std::atomic<size_t> threadCounter; + std::atomic<size_t> anyTasksRunning; + std::atomic<bool> hasRootTask; + std::exception_ptr cancellingException; + MutexSys mutex; + ConditionSys condition; + + private: + static size_t g_numThreads; + static __thread TaskScheduler* g_instance; + static __thread Thread* thread_local_thread; + static ThreadPool* threadPool; + }; + + RTC_NAMESPACE_END + +#if defined(RTC_NAMESPACE) + using RTC_NAMESPACE::TaskScheduler; +#endif +} diff --git a/thirdparty/embree/common/tasking/taskschedulerppl.h b/thirdparty/embree/common/tasking/taskschedulerppl.h new file mode 100644 index 0000000000..cbc2ecdbb8 --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulerppl.h @@ -0,0 +1,46 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if !defined(__WIN32__) +#error PPL tasking system only available under windows +#endif + +#include <ppl.h> + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() { + return GetCurrentThreadId(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + /* FIXME: threadIndex is NOT supported by PPL! */ + static __forceinline size_t threadIndex() { + return 0; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { + return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; + } + }; +}; diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h new file mode 100644 index 0000000000..35bd49849f --- /dev/null +++ b/thirdparty/embree/common/tasking/taskschedulertbb.h @@ -0,0 +1,73 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if defined(__WIN32__) +// -- GODOT start -- +#if !defined(NOMINMAX) +// -- GODOT end -- +# define NOMINMAX +// -- GODOT start -- +#endif +// -- GODOT end -- +#endif + +// We need to define these to avoid implicit linkage against +// tbb_debug.lib under Windows. When removing these lines debug build +// under Windows fails. +#define __TBB_NO_IMPLICIT_LINKAGE 1 +#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 +#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 +#include "tbb/tbb.h" +#include "tbb/parallel_sort.h" + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::current_thread_index(); +#elif TBB_INTERFACE_VERSION >= 9000 + return tbb::task_arena::current_thread_index(); +#else + return 0; +#endif + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::max_concurrency(); +#else + return tbb::task_scheduler_init::default_num_threads(); +#endif + } + + }; + +}; diff --git a/thirdparty/embree/include/embree3/rtcore.h b/thirdparty/embree/include/embree3/rtcore.h new file mode 100644 index 0000000000..450ab4c535 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore.h @@ -0,0 +1,14 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_config.h" +#include "rtcore_common.h" +#include "rtcore_device.h" +#include "rtcore_buffer.h" +#include "rtcore_ray.h" +#include "rtcore_geometry.h" +#include "rtcore_scene.h" +#include "rtcore_builder.h" +#include "rtcore_quaternion.h" diff --git a/thirdparty/embree/include/embree3/rtcore_buffer.h b/thirdparty/embree/include/embree3/rtcore_buffer.h new file mode 100644 index 0000000000..6b8eba9769 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_buffer.h @@ -0,0 +1,51 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_device.h" + +RTC_NAMESPACE_BEGIN + +/* Types of buffers */ +enum RTCBufferType +{ + RTC_BUFFER_TYPE_INDEX = 0, + RTC_BUFFER_TYPE_VERTEX = 1, + RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2, + RTC_BUFFER_TYPE_NORMAL = 3, + RTC_BUFFER_TYPE_TANGENT = 4, + RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5, + + RTC_BUFFER_TYPE_GRID = 8, + + RTC_BUFFER_TYPE_FACE = 16, + RTC_BUFFER_TYPE_LEVEL = 17, + RTC_BUFFER_TYPE_EDGE_CREASE_INDEX = 18, + RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT = 19, + RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX = 20, + RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21, + RTC_BUFFER_TYPE_HOLE = 22, + + RTC_BUFFER_TYPE_FLAGS = 32 +}; + +/* Opaque buffer type */ +typedef struct RTCBufferTy* RTCBuffer; + +/* Creates a new buffer. */ +RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize); + +/* Creates a new shared buffer. */ +RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize); + +/* Returns a pointer to the buffer data. */ +RTC_API void* rtcGetBufferData(RTCBuffer buffer); + +/* Retains the buffer (increments the reference count). */ +RTC_API void rtcRetainBuffer(RTCBuffer buffer); + +/* Releases the buffer (decrements the reference count). */ +RTC_API void rtcReleaseBuffer(RTCBuffer buffer); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/include/embree3/rtcore_builder.h b/thirdparty/embree/include/embree3/rtcore_builder.h new file mode 100644 index 0000000000..4bff999fed --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_builder.h @@ -0,0 +1,125 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_scene.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque BVH type */ +typedef struct RTCBVHTy* RTCBVH; + +/* Input build primitives for the builder */ +struct RTC_ALIGN(32) RTCBuildPrimitive +{ + float lower_x, lower_y, lower_z; + unsigned int geomID; + float upper_x, upper_y, upper_z; + unsigned int primID; +}; + +/* Opaque thread local allocator type */ +typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator; + +/* Callback to create a node */ +typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr); + +/* Callback to set the pointer to all children */ +typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr); + +/* Callback to set the bounds of all children */ +typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr); + +/* Callback to create a leaf node */ +typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr); + +/* Callback to split a build primitive */ +typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr); + +/* Build flags */ +enum RTCBuildFlags +{ + RTC_BUILD_FLAG_NONE = 0, + RTC_BUILD_FLAG_DYNAMIC = (1 << 0), +}; + +enum RTCBuildConstants +{ + RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32 +}; + +/* Input for builders */ +struct RTCBuildArguments +{ + size_t byteSize; + + enum RTCBuildQuality buildQuality; + enum RTCBuildFlags buildFlags; + unsigned int maxBranchingFactor; + unsigned int maxDepth; + unsigned int sahBlockSize; + unsigned int minLeafSize; + unsigned int maxLeafSize; + float traversalCost; + float intersectionCost; + + RTCBVH bvh; + struct RTCBuildPrimitive* primitives; + size_t primitiveCount; + size_t primitiveArrayCapacity; + + RTCCreateNodeFunction createNode; + RTCSetNodeChildrenFunction setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds; + RTCCreateLeafFunction createLeaf; + RTCSplitPrimitiveFunction splitPrimitive; + RTCProgressMonitorFunction buildProgress; + void* userPtr; +}; + +/* Returns the default build settings. */ +RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments() +{ + struct RTCBuildArguments args; + args.byteSize = sizeof(args); + args.buildQuality = RTC_BUILD_QUALITY_MEDIUM; + args.buildFlags = RTC_BUILD_FLAG_NONE; + args.maxBranchingFactor = 2; + args.maxDepth = 32; + args.sahBlockSize = 1; + args.minLeafSize = 1; + args.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF; + args.traversalCost = 1.0f; + args.intersectionCost = 1.0f; + args.bvh = NULL; + args.primitives = NULL; + args.primitiveCount = 0; + args.primitiveArrayCapacity = 0; + args.createNode = NULL; + args.setNodeChildren = NULL; + args.setNodeBounds = NULL; + args.createLeaf = NULL; + args.splitPrimitive = NULL; + args.buildProgress = NULL; + args.userPtr = NULL; + return args; +} + +/* Creates a new BVH. */ +RTC_API RTCBVH rtcNewBVH(RTCDevice device); + +/* Builds a BVH. */ +RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args); + +/* Allocates memory using the thread local allocator. */ +RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align); + +/* Retains the BVH (increments reference count). */ +RTC_API void rtcRetainBVH(RTCBVH bvh); + +/* Releases the BVH (decrements reference count). */ +RTC_API void rtcReleaseBVH(RTCBVH bvh); + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h new file mode 100644 index 0000000000..4857e1e05e --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_common.h @@ -0,0 +1,328 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <stddef.h> +#include <sys/types.h> +#include <stdbool.h> + +#include "rtcore_config.h" + +RTC_NAMESPACE_BEGIN + +#if defined(_WIN32) +#if defined(_M_X64) +typedef long long ssize_t; +#else +typedef int ssize_t; +#endif +#endif + +// -- GODOT start -- +#if defined(_WIN32) && defined(_MSC_VER) +// -- GODOT end -- +# define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) +#else +# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) +#endif + +#if !defined (RTC_DEPRECATED) +#ifdef __GNUC__ + #define RTC_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) + #define RTC_DEPRECATED __declspec(deprecated) +#else + #define RTC_DEPRECATED +#endif +#endif + +#if defined(_WIN32) +# define RTC_FORCEINLINE __forceinline +#else +# define RTC_FORCEINLINE inline __attribute__((always_inline)) +#endif + +/* Invalid geometry ID */ +#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1) + +/* Maximum number of time steps */ +#define RTC_MAX_TIME_STEP_COUNT 129 + +/* Formats of buffers and other data structures */ +enum RTCFormat +{ + RTC_FORMAT_UNDEFINED = 0, + + /* 8-bit unsigned integer */ + RTC_FORMAT_UCHAR = 0x1001, + RTC_FORMAT_UCHAR2, + RTC_FORMAT_UCHAR3, + RTC_FORMAT_UCHAR4, + + /* 8-bit signed integer */ + RTC_FORMAT_CHAR = 0x2001, + RTC_FORMAT_CHAR2, + RTC_FORMAT_CHAR3, + RTC_FORMAT_CHAR4, + + /* 16-bit unsigned integer */ + RTC_FORMAT_USHORT = 0x3001, + RTC_FORMAT_USHORT2, + RTC_FORMAT_USHORT3, + RTC_FORMAT_USHORT4, + + /* 16-bit signed integer */ + RTC_FORMAT_SHORT = 0x4001, + RTC_FORMAT_SHORT2, + RTC_FORMAT_SHORT3, + RTC_FORMAT_SHORT4, + + /* 32-bit unsigned integer */ + RTC_FORMAT_UINT = 0x5001, + RTC_FORMAT_UINT2, + RTC_FORMAT_UINT3, + RTC_FORMAT_UINT4, + + /* 32-bit signed integer */ + RTC_FORMAT_INT = 0x6001, + RTC_FORMAT_INT2, + RTC_FORMAT_INT3, + RTC_FORMAT_INT4, + + /* 64-bit unsigned integer */ + RTC_FORMAT_ULLONG = 0x7001, + RTC_FORMAT_ULLONG2, + RTC_FORMAT_ULLONG3, + RTC_FORMAT_ULLONG4, + + /* 64-bit signed integer */ + RTC_FORMAT_LLONG = 0x8001, + RTC_FORMAT_LLONG2, + RTC_FORMAT_LLONG3, + RTC_FORMAT_LLONG4, + + /* 32-bit float */ + RTC_FORMAT_FLOAT = 0x9001, + RTC_FORMAT_FLOAT2, + RTC_FORMAT_FLOAT3, + RTC_FORMAT_FLOAT4, + RTC_FORMAT_FLOAT5, + RTC_FORMAT_FLOAT6, + RTC_FORMAT_FLOAT7, + RTC_FORMAT_FLOAT8, + RTC_FORMAT_FLOAT9, + RTC_FORMAT_FLOAT10, + RTC_FORMAT_FLOAT11, + RTC_FORMAT_FLOAT12, + RTC_FORMAT_FLOAT13, + RTC_FORMAT_FLOAT14, + RTC_FORMAT_FLOAT15, + RTC_FORMAT_FLOAT16, + + /* 32-bit float matrix (row-major order) */ + RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122, + RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123, + RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124, + RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132, + RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133, + RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134, + RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142, + RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143, + RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144, + + /* 32-bit float matrix (column-major order) */ + RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222, + RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223, + RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224, + RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232, + RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233, + RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234, + RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242, + RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243, + RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244, + + /* special 12-byte format for grids */ + RTC_FORMAT_GRID = 0xA001 +}; + +/* Build quality levels */ +enum RTCBuildQuality +{ + RTC_BUILD_QUALITY_LOW = 0, + RTC_BUILD_QUALITY_MEDIUM = 1, + RTC_BUILD_QUALITY_HIGH = 2, + RTC_BUILD_QUALITY_REFIT = 3, +}; + +/* Axis-aligned bounding box representation */ +struct RTC_ALIGN(16) RTCBounds +{ + float lower_x, lower_y, lower_z, align0; + float upper_x, upper_y, upper_z, align1; +}; + +/* Linear axis-aligned bounding box representation */ +struct RTC_ALIGN(16) RTCLinearBounds +{ + struct RTCBounds bounds0; + struct RTCBounds bounds1; +}; + +/* Intersection context flags */ +enum RTCIntersectContextFlags +{ + RTC_INTERSECT_CONTEXT_FLAG_NONE = 0, + RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays + RTC_INTERSECT_CONTEXT_FLAG_COHERENT = (1 << 0) // optimize for coherent rays +}; + +/* Arguments for RTCFilterFunctionN */ +struct RTCFilterFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + struct RTCIntersectContext* context; + struct RTCRayN* ray; + struct RTCHitN* hit; + unsigned int N; +}; + +/* Filter callback function */ +typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args); + +/* Intersection context passed to intersect/occluded calls */ +struct RTCIntersectContext +{ + enum RTCIntersectContextFlags flags; // intersection flags + RTCFilterFunctionN filter; // filter function to execute + +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + unsigned int instStackSize; // Number of instances currently on the stack. +#endif + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids. + +#if RTC_MIN_WIDTH + float minWidthDistanceFactor; // curve radius is set to this factor times distance to ray origin +#endif +}; + +/* Initializes an intersection context. */ +RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context) +{ + unsigned l = 0; + context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; + context->filter = NULL; + +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + context->instStackSize = 0; +#endif + for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + context->instID[l] = RTC_INVALID_GEOMETRY_ID; + +#if RTC_MIN_WIDTH + context->minWidthDistanceFactor = 0.0f; +#endif +} + +/* Point query structure for closest point query */ +struct RTC_ALIGN(16) RTCPointQuery +{ + float x; // x coordinate of the query point + float y; // y coordinate of the query point + float z; // z coordinate of the query point + float time; // time of the point query + float radius; // radius of the point query +}; + +/* Structure of a packet of 4 query points */ +struct RTC_ALIGN(16) RTCPointQuery4 +{ + float x[4]; // x coordinate of the query point + float y[4]; // y coordinate of the query point + float z[4]; // z coordinate of the query point + float time[4]; // time of the point query + float radius[4]; // radius of the point query +}; + +/* Structure of a packet of 8 query points */ +struct RTC_ALIGN(32) RTCPointQuery8 +{ + float x[8]; // x coordinate of the query point + float y[8]; // y coordinate of the query point + float z[8]; // z coordinate of the query point + float time[8]; // time of the point query + float radius[8]; // radius ofr the point query +}; + +/* Structure of a packet of 16 query points */ +struct RTC_ALIGN(64) RTCPointQuery16 +{ + float x[16]; // x coordinate of the query point + float y[16]; // y coordinate of the query point + float z[16]; // z coordinate of the query point + float time[16]; // time of the point quey + float radius[16]; // radius of the point query +}; + +struct RTCPointQueryN; + +struct RTC_ALIGN(16) RTCPointQueryContext +{ + // accumulated 4x4 column major matrices from world space to instance space. + // undefined if size == 0. + float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + + // accumulated 4x4 column major matrices from instance space to world space. + // undefined if size == 0. + float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + + // instance ids. + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; + + // number of instances currently on the stack. + unsigned int instStackSize; +}; + +/* Initializes an intersection context. */ +RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context) +{ + context->instStackSize = 0; + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +} + +struct RTC_ALIGN(16) RTCPointQueryFunctionArguments +{ + // The (world space) query object that was passed as an argument of rtcPointQuery. The + // radius of the query can be decreased inside the callback to shrink the + // search domain. Increasing the radius or modifying the time or position of + // the query results in undefined behaviour. + struct RTCPointQuery* query; + + // Used for user input/output data. Will not be read or modified internally. + void* userPtr; + + // primitive and geometry ID of primitive + unsigned int primID; + unsigned int geomID; + + // the context with transformation and instance ID stack + struct RTCPointQueryContext* context; + + // If the current instance transform M (= context->world2inst[context->instStackSize]) + // is a similarity matrix, i.e there is a constant factor similarityScale such that, + // for all x,y: dist(Mx, My) = similarityScale * dist(x, y), + // The similarity scale is 0, if the current instance transform is not a + // similarity transform and vice versa. The similarity scale allows to compute + // distance information in instance space and scale the distances into world + // space by dividing with the similarity scale, for example, to update the + // query radius. If the current instance transform is not a similarity + // transform (similarityScale = 0), the distance computation has to be + // performed in world space to ensure correctness. if there is no instance + // transform (context->instStackSize == 0), the similarity scale is 1. + float similarityScale; +}; + +typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h new file mode 100644 index 0000000000..3a9819c9f1 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -0,0 +1,57 @@ + +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define RTC_VERSION_MAJOR 3 +#define RTC_VERSION_MINOR 13 +#define RTC_VERSION_PATCH 0 +#define RTC_VERSION 31300 +#define RTC_VERSION_STRING "3.13.0" + +#define RTC_MAX_INSTANCE_LEVEL_COUNT 1 + +#define EMBREE_MIN_WIDTH 0 +#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH + +#define EMBREE_STATIC_LIB +/* #undef EMBREE_API_NAMESPACE */ + +#if defined(EMBREE_API_NAMESPACE) +# define RTC_NAMESPACE +# define RTC_NAMESPACE_BEGIN namespace { +# define RTC_NAMESPACE_END } +# define RTC_NAMESPACE_USE using namespace ; +# define RTC_API_EXTERN_C +# undef EMBREE_API_NAMESPACE +#else +# define RTC_NAMESPACE_BEGIN +# define RTC_NAMESPACE_END +# define RTC_NAMESPACE_USE +# if defined(__cplusplus) +# define RTC_API_EXTERN_C extern "C" +# else +# define RTC_API_EXTERN_C +# endif +#endif + +#if defined(ISPC) +# define RTC_API_IMPORT extern "C" unmasked +# define RTC_API_EXPORT extern "C" unmasked +#elif defined(EMBREE_STATIC_LIB) +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C +#elif defined(_WIN32) +# define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport) +# define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport) +#else +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default"))) +#endif + +#if defined(RTC_EXPORT_API) +# define RTC_API RTC_API_EXPORT +#else +# define RTC_API RTC_API_IMPORT +#endif diff --git a/thirdparty/embree/include/embree3/rtcore_device.h b/thirdparty/embree/include/embree3/rtcore_device.h new file mode 100644 index 0000000000..2dd3047603 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_device.h @@ -0,0 +1,87 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque device type */ +typedef struct RTCDeviceTy* RTCDevice; + +/* Creates a new Embree device. */ +RTC_API RTCDevice rtcNewDevice(const char* config); + +/* Retains the Embree device (increments the reference count). */ +RTC_API void rtcRetainDevice(RTCDevice device); + +/* Releases an Embree device (decrements the reference count). */ +RTC_API void rtcReleaseDevice(RTCDevice device); + +/* Device properties */ +enum RTCDeviceProperty +{ + RTC_DEVICE_PROPERTY_VERSION = 0, + RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1, + RTC_DEVICE_PROPERTY_VERSION_MINOR = 2, + RTC_DEVICE_PROPERTY_VERSION_PATCH = 3, + + RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED = 32, + RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED = 33, + RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34, + RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED = 35, + + RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63, + RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED = 64, + RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED = 65, + RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED = 66, + RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67, + RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED = 68, + + RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED = 96, + RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED = 97, + RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98, + RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED = 99, + RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED = 100, + RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED = 101, + + RTC_DEVICE_PROPERTY_TASKING_SYSTEM = 128, + RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129, + RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130 +}; + +/* Gets a device property. */ +RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop); + +/* Sets a device property. */ +RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value); + +/* Error codes */ +enum RTCError +{ + RTC_ERROR_NONE = 0, + RTC_ERROR_UNKNOWN = 1, + RTC_ERROR_INVALID_ARGUMENT = 2, + RTC_ERROR_INVALID_OPERATION = 3, + RTC_ERROR_OUT_OF_MEMORY = 4, + RTC_ERROR_UNSUPPORTED_CPU = 5, + RTC_ERROR_CANCELLED = 6 +}; + +/* Returns the error code. */ +RTC_API enum RTCError rtcGetDeviceError(RTCDevice device); + +/* Error callback function */ +typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str); + +/* Sets the error callback function. */ +RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr); + +/* Memory monitor callback function */ +typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post); + +/* Sets the memory monitor callback function. */ +RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/include/embree3/rtcore_geometry.h b/thirdparty/embree/include/embree3/rtcore_geometry.h new file mode 100644 index 0000000000..d1de17491c --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_geometry.h @@ -0,0 +1,383 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_buffer.h" +#include "rtcore_quaternion.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque scene type */ +typedef struct RTCSceneTy* RTCScene; + +/* Opaque geometry type */ +typedef struct RTCGeometryTy* RTCGeometry; + +/* Types of geometries */ +enum RTCGeometryType +{ + RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh + RTC_GEOMETRY_TYPE_QUAD = 1, // quad (triangle pair) mesh + RTC_GEOMETRY_TYPE_GRID = 2, // grid mesh + + RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface + + RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE = 15, // Cone linear curves - discontinuous at edge boundaries + RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE = 16, // Round (rounded cone like) linear curves + RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE = 17, // flat (ribbon-like) linear curves + + RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE = 24, // round (tube-like) Bezier curves + RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE = 25, // flat (ribbon-like) Bezier curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE = 26, // flat normal-oriented Bezier curves + + RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves + RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE = 33, // flat (ribbon-like) B-spline curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE = 34, // flat normal-oriented B-spline curves + + RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves + RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE = 41, // flat (ribbon-like) Hermite curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE = 42, // flat normal-oriented Hermite curves + + RTC_GEOMETRY_TYPE_SPHERE_POINT = 50, + RTC_GEOMETRY_TYPE_DISC_POINT = 51, + RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52, + + RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves + RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE = 59, // flat (ribbon-like) Catmull-Rom curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE = 60, // flat normal-oriented Catmull-Rom curves + + RTC_GEOMETRY_TYPE_USER = 120, // user-defined geometry + RTC_GEOMETRY_TYPE_INSTANCE = 121 // scene instance +}; + +/* Interpolation modes for subdivision surfaces */ +enum RTCSubdivisionMode +{ + RTC_SUBDIVISION_MODE_NO_BOUNDARY = 0, + RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1, + RTC_SUBDIVISION_MODE_PIN_CORNERS = 2, + RTC_SUBDIVISION_MODE_PIN_BOUNDARY = 3, + RTC_SUBDIVISION_MODE_PIN_ALL = 4, +}; + +/* Curve segment flags */ +enum RTCCurveFlags +{ + RTC_CURVE_FLAG_NEIGHBOR_LEFT = (1 << 0), // left segments exists + RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1) // right segment exists +}; + +/* Arguments for RTCBoundsFunction */ +struct RTCBoundsFunctionArguments +{ + void* geometryUserPtr; + unsigned int primID; + unsigned int timeStep; + struct RTCBounds* bounds_o; +}; + +/* Bounding callback function */ +typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args); + +/* Arguments for RTCIntersectFunctionN */ +struct RTCIntersectFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + unsigned int primID; + struct RTCIntersectContext* context; + struct RTCRayHitN* rayhit; + unsigned int N; + unsigned int geomID; +}; + +/* Intersection callback function */ +typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args); + +/* Arguments for RTCOccludedFunctionN */ +struct RTCOccludedFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + unsigned int primID; + struct RTCIntersectContext* context; + struct RTCRayN* ray; + unsigned int N; + unsigned int geomID; +}; + +/* Occlusion callback function */ +typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args); + +/* Arguments for RTCDisplacementFunctionN */ +struct RTCDisplacementFunctionNArguments +{ + void* geometryUserPtr; + RTCGeometry geometry; + unsigned int primID; + unsigned int timeStep; + const float* u; + const float* v; + const float* Ng_x; + const float* Ng_y; + const float* Ng_z; + float* P_x; + float* P_y; + float* P_z; + unsigned int N; +}; + +/* Displacement mapping callback function */ +typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args); + +/* Creates a new geometry of specified type. */ +RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type); + +/* Retains the geometry (increments the reference count). */ +RTC_API void rtcRetainGeometry(RTCGeometry geometry); + +/* Releases the geometry (decrements the reference count) */ +RTC_API void rtcReleaseGeometry(RTCGeometry geometry); + +/* Commits the geometry. */ +RTC_API void rtcCommitGeometry(RTCGeometry geometry); + + +/* Enables the geometry. */ +RTC_API void rtcEnableGeometry(RTCGeometry geometry); + +/* Disables the geometry. */ +RTC_API void rtcDisableGeometry(RTCGeometry geometry); + + +/* Sets the number of motion blur time steps of the geometry. */ +RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount); + +/* Sets the motion blur time range of the geometry. */ +RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime); + +/* Sets the number of vertex attributes of the geometry. */ +RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount); + +/* Sets the ray mask of the geometry. */ +RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask); + +/* Sets the build quality of the geometry. */ +RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality); + +/* Sets the maximal curve or point radius scale allowed by min-width feature. */ +RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale); + + +/* Sets a geometry buffer. */ +RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount); + +/* Sets a shared geometry buffer. */ +RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount); + +/* Creates and sets a new geometry buffer. */ +RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount); + +/* Returns the pointer to the data of a buffer. */ +RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); + +/* Updates a geometry buffer. */ +RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); + + +/* Sets the intersection filter callback function of the geometry. */ +RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); + +/* Sets the occlusion filter callback function of the geometry. */ +RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); + +/* Sets the user-defined data pointer of the geometry. */ +RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr); + +/* Gets the user-defined data pointer of the geometry. */ +RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry); + +/* Set the point query callback function of a geometry. */ +RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery); + +/* Sets the number of primitives of a user geometry. */ +RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount); + +/* Sets the bounding callback function to calculate bounding boxes for user primitives. */ +RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr); + +/* Set the intersect callback function of a user geometry. */ +RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect); + +/* Set the occlusion callback function of a user geometry. */ +RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded); + +/* Invokes the intersection filter from the intersection callback function. */ +RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); + +/* Invokes the occlusion filter from the occlusion callback function. */ +RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); + + +/* Sets the instanced scene of an instance geometry. */ +RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene); + +/* Sets the transformation of an instance for the specified time step. */ +RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm); + +/* Sets the transformation quaternion of an instance for the specified time step. */ +RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd); + +/* Returns the interpolated transformation of an instance for the specified time. */ +RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm); + + +/* Sets the uniform tessellation rate of the geometry. */ +RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate); + +/* Sets the number of topologies of a subdivision surface. */ +RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount); + +/* Sets the subdivision interpolation mode. */ +RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode); + +/* Binds a vertex attribute to a topology of the geometry. */ +RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID); + +/* Sets the displacement callback function of a subdivision surface. */ +RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement); + +/* Returns the first half edge of a face. */ +RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID); + +/* Returns the face the half edge belongs to. */ +RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID); + +/* Returns next half edge. */ +RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID); + +/* Returns previous half edge. */ +RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID); + +/* Returns opposite half edge. */ +RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID); + + +/* Arguments for rtcInterpolate */ +struct RTCInterpolateArguments +{ + RTCGeometry geometry; + unsigned int primID; + float u; + float v; + enum RTCBufferType bufferType; + unsigned int bufferSlot; + float* P; + float* dPdu; + float* dPdv; + float* ddPdudu; + float* ddPdvdv; + float* ddPdudv; + unsigned int valueCount; +}; + +/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */ +RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args); + +/* Interpolates vertex data to some u/v location. */ +RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = NULL; + args.dPdv = NULL; + args.ddPdudu = NULL; + args.ddPdvdv = NULL; + args.ddPdudv = NULL; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Interpolates vertex data to some u/v location and calculates first order derivatives. */ +RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, + float* P, float* dPdu, float* dPdv, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = dPdu; + args.dPdv = dPdv; + args.ddPdudu = NULL; + args.ddPdvdv = NULL; + args.ddPdudv = NULL; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. */ +RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = dPdu; + args.dPdv = dPdv; + args.ddPdudu = ddPdudu; + args.ddPdvdv = ddPdvdv; + args.ddPdudv = ddPdudv; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Arguments for rtcInterpolateN */ +struct RTCInterpolateNArguments +{ + RTCGeometry geometry; + const void* valid; + const unsigned int* primIDs; + const float* u; + const float* v; + unsigned int N; + enum RTCBufferType bufferType; + unsigned int bufferSlot; + float* P; + float* dPdu; + float* dPdv; + float* ddPdudu; + float* ddPdvdv; + float* ddPdudv; + unsigned int valueCount; +}; + +/* Interpolates vertex data to an array of u/v locations. */ +RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args); + +/* RTCGrid primitive for grid mesh */ +struct RTCGrid +{ + unsigned int startVertexID; + unsigned int stride; + unsigned short width,height; // max is a 32k x 32k grid +}; + +RTC_NAMESPACE_END + + diff --git a/thirdparty/embree/include/embree3/rtcore_quaternion.h b/thirdparty/embree/include/embree3/rtcore_quaternion.h new file mode 100644 index 0000000000..6489fa3467 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_quaternion.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* + * Structure for transformation respresentation as a matrix decomposition using + * a quaternion + */ +struct RTC_ALIGN(16) RTCQuaternionDecomposition +{ + float scale_x; + float scale_y; + float scale_z; + float skew_xy; + float skew_xz; + float skew_yz; + float shift_x; + float shift_y; + float shift_z; + float quaternion_r; + float quaternion_i; + float quaternion_j; + float quaternion_k; + float translation_x; + float translation_y; + float translation_z; +}; + +RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp) +{ + qdecomp->scale_x = 1.f; + qdecomp->scale_y = 1.f; + qdecomp->scale_z = 1.f; + qdecomp->skew_xy = 0.f; + qdecomp->skew_xz = 0.f; + qdecomp->skew_yz = 0.f; + qdecomp->shift_x = 0.f; + qdecomp->shift_y = 0.f; + qdecomp->shift_z = 0.f; + qdecomp->quaternion_r = 1.f; + qdecomp->quaternion_i = 0.f; + qdecomp->quaternion_j = 0.f; + qdecomp->quaternion_k = 0.f; + qdecomp->translation_x = 0.f; + qdecomp->translation_y = 0.f; + qdecomp->translation_z = 0.f; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion( + struct RTCQuaternionDecomposition* qdecomp, + float r, float i, float j, float k) +{ + qdecomp->quaternion_r = r; + qdecomp->quaternion_i = i; + qdecomp->quaternion_j = j; + qdecomp->quaternion_k = k; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale( + struct RTCQuaternionDecomposition* qdecomp, + float scale_x, float scale_y, float scale_z) +{ + qdecomp->scale_x = scale_x; + qdecomp->scale_y = scale_y; + qdecomp->scale_z = scale_z; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew( + struct RTCQuaternionDecomposition* qdecomp, + float skew_xy, float skew_xz, float skew_yz) +{ + qdecomp->skew_xy = skew_xy; + qdecomp->skew_xz = skew_xz; + qdecomp->skew_yz = skew_yz; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift( + struct RTCQuaternionDecomposition* qdecomp, + float shift_x, float shift_y, float shift_z) +{ + qdecomp->shift_x = shift_x; + qdecomp->shift_y = shift_y; + qdecomp->shift_z = shift_z; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation( + struct RTCQuaternionDecomposition* qdecomp, + float translation_x, float translation_y, float translation_z) +{ + qdecomp->translation_x = translation_x; + qdecomp->translation_y = translation_y; + qdecomp->translation_z = translation_z; +} + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/include/embree3/rtcore_ray.h b/thirdparty/embree/include/embree3/rtcore_ray.h new file mode 100644 index 0000000000..a2ee6dabbb --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_ray.h @@ -0,0 +1,378 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* Ray structure for a single ray */ +struct RTC_ALIGN(16) RTCRay +{ + float org_x; // x coordinate of ray origin + float org_y; // y coordinate of ray origin + float org_z; // z coordinate of ray origin + float tnear; // start of ray segment + + float dir_x; // x coordinate of ray direction + float dir_y; // y coordinate of ray direction + float dir_z; // z coordinate of ray direction + float time; // time of this ray for motion blur + + float tfar; // end of ray segment (set to hit distance) + unsigned int mask; // ray mask + unsigned int id; // ray ID + unsigned int flags; // ray flags +}; + +/* Hit structure for a single ray */ +struct RTC_ALIGN(16) RTCHit +{ + float Ng_x; // x coordinate of geometry normal + float Ng_y; // y coordinate of geometry normal + float Ng_z; // z coordinate of geometry normal + + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID +}; + +/* Combined ray/hit structure for a single ray */ +struct RTCRayHit +{ + struct RTCRay ray; + struct RTCHit hit; +}; + +/* Ray structure for a packet of 4 rays */ +struct RTC_ALIGN(16) RTCRay4 +{ + float org_x[4]; + float org_y[4]; + float org_z[4]; + float tnear[4]; + + float dir_x[4]; + float dir_y[4]; + float dir_z[4]; + float time[4]; + + float tfar[4]; + unsigned int mask[4]; + unsigned int id[4]; + unsigned int flags[4]; +}; + +/* Hit structure for a packet of 4 rays */ +struct RTC_ALIGN(16) RTCHit4 +{ + float Ng_x[4]; + float Ng_y[4]; + float Ng_z[4]; + + float u[4]; + float v[4]; + + unsigned int primID[4]; + unsigned int geomID[4]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4]; +}; + +/* Combined ray/hit structure for a packet of 4 rays */ +struct RTCRayHit4 +{ + struct RTCRay4 ray; + struct RTCHit4 hit; +}; + +/* Ray structure for a packet of 8 rays */ +struct RTC_ALIGN(32) RTCRay8 +{ + float org_x[8]; + float org_y[8]; + float org_z[8]; + float tnear[8]; + + float dir_x[8]; + float dir_y[8]; + float dir_z[8]; + float time[8]; + + float tfar[8]; + unsigned int mask[8]; + unsigned int id[8]; + unsigned int flags[8]; +}; + +/* Hit structure for a packet of 8 rays */ +struct RTC_ALIGN(32) RTCHit8 +{ + float Ng_x[8]; + float Ng_y[8]; + float Ng_z[8]; + + float u[8]; + float v[8]; + + unsigned int primID[8]; + unsigned int geomID[8]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8]; +}; + +/* Combined ray/hit structure for a packet of 8 rays */ +struct RTCRayHit8 +{ + struct RTCRay8 ray; + struct RTCHit8 hit; +}; + +/* Ray structure for a packet of 16 rays */ +struct RTC_ALIGN(64) RTCRay16 +{ + float org_x[16]; + float org_y[16]; + float org_z[16]; + float tnear[16]; + + float dir_x[16]; + float dir_y[16]; + float dir_z[16]; + float time[16]; + + float tfar[16]; + unsigned int mask[16]; + unsigned int id[16]; + unsigned int flags[16]; +}; + +/* Hit structure for a packet of 16 rays */ +struct RTC_ALIGN(64) RTCHit16 +{ + float Ng_x[16]; + float Ng_y[16]; + float Ng_z[16]; + + float u[16]; + float v[16]; + + unsigned int primID[16]; + unsigned int geomID[16]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; +}; + +/* Combined ray/hit structure for a packet of 16 rays */ +struct RTCRayHit16 +{ + struct RTCRay16 ray; + struct RTCHit16 hit; +}; + +/* Ray structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCRayNp +{ + float* org_x; + float* org_y; + float* org_z; + float* tnear; + + float* dir_x; + float* dir_y; + float* dir_z; + float* time; + + float* tfar; + unsigned int* mask; + unsigned int* id; + unsigned int* flags; +}; + +/* Hit structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCHitNp +{ + float* Ng_x; + float* Ng_y; + float* Ng_z; + + float* u; + float* v; + + unsigned int* primID; + unsigned int* geomID; + unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; +}; + +/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCRayHitNp +{ + struct RTCRayNp ray; + struct RTCHitNp hit; +}; + +struct RTCRayN; +struct RTCHitN; +struct RTCRayHitN; + +#if defined(__cplusplus) + +/* Helper functions to access ray packets of runtime size N */ +RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[0*N+i]; } +RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[1*N+i]; } +RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[2*N+i]; } +RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[3*N+i]; } + +RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[4*N+i]; } +RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[5*N+i]; } +RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[6*N+i]; } +RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[7*N+i]; } + +RTC_FORCEINLINE float& RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[8*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[9*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_id (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[10*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[11*N+i]; } + +/* Helper functions to access hit packets of runtime size N */ +RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[0*N+i]; } +RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[1*N+i]; } +RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[2*N+i]; } + +RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; } +RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; } + +RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; } +RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; } +RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; } + +/* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */ +RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; } +RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return (RTCHitN*)&((float*)rayhit)[12*N]; } + +/* Helper structure for a ray packet of compile-time size N */ +template<int N> +struct RTCRayNt +{ + float org_x[N]; + float org_y[N]; + float org_z[N]; + float tnear[N]; + + float dir_x[N]; + float dir_y[N]; + float dir_z[N]; + float time[N]; + + float tfar[N]; + unsigned int mask[N]; + unsigned int id[N]; + unsigned int flags[N]; +}; + +/* Helper structure for a hit packet of compile-time size N */ +template<int N> +struct RTCHitNt +{ + float Ng_x[N]; + float Ng_y[N]; + float Ng_z[N]; + + float u[N]; + float v[N]; + + unsigned int primID[N]; + unsigned int geomID[N]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N]; +}; + +/* Helper structure for a combined ray/hit packet of compile-time size N */ +template<int N> +struct RTCRayHitNt +{ + RTCRayNt<N> ray; + RTCHitNt<N> hit; +}; + +RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i) +{ + RTCRay ray; + ray.org_x = RTCRayN_org_x(rayN,N,i); + ray.org_y = RTCRayN_org_y(rayN,N,i); + ray.org_z = RTCRayN_org_z(rayN,N,i); + ray.tnear = RTCRayN_tnear(rayN,N,i); + ray.dir_x = RTCRayN_dir_x(rayN,N,i); + ray.dir_y = RTCRayN_dir_y(rayN,N,i); + ray.dir_z = RTCRayN_dir_z(rayN,N,i); + ray.time = RTCRayN_time(rayN,N,i); + ray.tfar = RTCRayN_tfar(rayN,N,i); + ray.mask = RTCRayN_mask(rayN,N,i); + ray.id = RTCRayN_id(rayN,N,i); + ray.flags = RTCRayN_flags(rayN,N,i); + return ray; +} + +RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i) +{ + RTCHit hit; + hit.Ng_x = RTCHitN_Ng_x(hitN,N,i); + hit.Ng_y = RTCHitN_Ng_y(hitN,N,i); + hit.Ng_z = RTCHitN_Ng_z(hitN,N,i); + hit.u = RTCHitN_u(hitN,N,i); + hit.v = RTCHitN_v(hitN,N,i); + hit.primID = RTCHitN_primID(hitN,N,i); + hit.geomID = RTCHitN_geomID(hitN,N,i); + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + hit.instID[l] = RTCHitN_instID(hitN,N,i,l); + return hit; +} + +RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i) +{ + RTCHitN_Ng_x(hitN,N,i) = hit->Ng_x; + RTCHitN_Ng_y(hitN,N,i) = hit->Ng_y; + RTCHitN_Ng_z(hitN,N,i) = hit->Ng_z; + RTCHitN_u(hitN,N,i) = hit->u; + RTCHitN_v(hitN,N,i) = hit->v; + RTCHitN_primID(hitN,N,i) = hit->primID; + RTCHitN_geomID(hitN,N,i) = hit->geomID; + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + RTCHitN_instID(hitN,N,i,l) = hit->instID[l]; +} + +RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i) +{ + RTCRayHit rh; + + RTCRayN* ray = RTCRayHitN_RayN(rayhitN,N); + rh.ray.org_x = RTCRayN_org_x(ray,N,i); + rh.ray.org_y = RTCRayN_org_y(ray,N,i); + rh.ray.org_z = RTCRayN_org_z(ray,N,i); + rh.ray.tnear = RTCRayN_tnear(ray,N,i); + rh.ray.dir_x = RTCRayN_dir_x(ray,N,i); + rh.ray.dir_y = RTCRayN_dir_y(ray,N,i); + rh.ray.dir_z = RTCRayN_dir_z(ray,N,i); + rh.ray.time = RTCRayN_time(ray,N,i); + rh.ray.tfar = RTCRayN_tfar(ray,N,i); + rh.ray.mask = RTCRayN_mask(ray,N,i); + rh.ray.id = RTCRayN_id(ray,N,i); + rh.ray.flags = RTCRayN_flags(ray,N,i); + + RTCHitN* hit = RTCRayHitN_HitN(rayhitN,N); + rh.hit.Ng_x = RTCHitN_Ng_x(hit,N,i); + rh.hit.Ng_y = RTCHitN_Ng_y(hit,N,i); + rh.hit.Ng_z = RTCHitN_Ng_z(hit,N,i); + rh.hit.u = RTCHitN_u(hit,N,i); + rh.hit.v = RTCHitN_v(hit,N,i); + rh.hit.primID = RTCHitN_primID(hit,N,i); + rh.hit.geomID = RTCHitN_geomID(hit,N,i); + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l); + + return rh; +} + +#endif + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/include/embree3/rtcore_scene.h b/thirdparty/embree/include/embree3/rtcore_scene.h new file mode 100644 index 0000000000..5878a3d402 --- /dev/null +++ b/thirdparty/embree/include/embree3/rtcore_scene.h @@ -0,0 +1,160 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_device.h" + +RTC_NAMESPACE_BEGIN + +/* Forward declarations for ray structures */ +struct RTCRayHit; +struct RTCRayHit4; +struct RTCRayHit8; +struct RTCRayHit16; +struct RTCRayHitNp; + +/* Scene flags */ +enum RTCSceneFlags +{ + RTC_SCENE_FLAG_NONE = 0, + RTC_SCENE_FLAG_DYNAMIC = (1 << 0), + RTC_SCENE_FLAG_COMPACT = (1 << 1), + RTC_SCENE_FLAG_ROBUST = (1 << 2), + RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3) +}; + +/* Creates a new scene. */ +RTC_API RTCScene rtcNewScene(RTCDevice device); + +/* Returns the device the scene got created in. The reference count of + * the device is incremented by this function. */ +RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene); + +/* Retains the scene (increments the reference count). */ +RTC_API void rtcRetainScene(RTCScene scene); + +/* Releases the scene (decrements the reference count). */ +RTC_API void rtcReleaseScene(RTCScene scene); + + +/* Attaches the geometry to a scene. */ +RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry); + +/* Attaches the geometry to a scene using the specified geometry ID. */ +RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID); + +/* Detaches the geometry from the scene. */ +RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID); + +/* Gets a geometry handle from the scene. */ +RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID); + + +/* Commits the scene. */ +RTC_API void rtcCommitScene(RTCScene scene); + +/* Commits the scene from multiple threads. */ +RTC_API void rtcJoinCommitScene(RTCScene scene); + + +/* Progress monitor callback function */ +typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n); + +/* Sets the progress monitor callback function of the scene. */ +RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr); + +/* Sets the build quality of the scene. */ +RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality); + +/* Sets the scene flags. */ +RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags); + +/* Returns the scene flags. */ +RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene); + +/* Returns the axis-aligned bounds of the scene. */ +RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o); + +/* Returns the linear axis-aligned bounds of the scene. */ +RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o); + + +/* Perform a closest point query of the scene. */ +RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr); + +/* Perform a closest point query with a packet of 4 points with the scene. */ +RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); + +/* Perform a closest point query with a packet of 4 points with the scene. */ +RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); + +/* Perform a closest point query with a packet of 4 points with the scene. */ +RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); + +/* Intersects a single ray with the scene. */ +RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit); + +/* Intersects a packet of 4 rays with the scene. */ +RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit); + +/* Intersects a packet of 8 rays with the scene. */ +RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit); + +/* Intersects a packet of 16 rays with the scene. */ +RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit); + +/* Intersects a stream of M rays with the scene. */ +RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride); + +/* Intersects a stream of pointers to M rays with the scene. */ +RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M); + +/* Intersects a stream of M ray packets of size N in SOA format with the scene. */ +RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride); + +/* Intersects a stream of M ray packets of size N in SOA format with the scene. */ +RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N); + +/* Tests a single ray for occlusion with the scene. */ +RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray); + +/* Tests a packet of 4 rays for occlusion occluded with the scene. */ +RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray); + +/* Tests a packet of 8 rays for occlusion with the scene. */ +RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray); + +/* Tests a packet of 16 rays for occlusion with the scene. */ +RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray); + +/* Tests a stream of M rays for occlusion with the scene. */ +RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride); + +/* Tests a stream of pointers to M rays for occlusion with the scene. */ +RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M); + +/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ +RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride); + +/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ +RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N); + +/*! collision callback */ +struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; }; +typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions); + +/*! Performs collision detection of two scenes */ +RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr); + +#if defined(__cplusplus) + +/* Helper for easily combining scene flags */ +inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) { + return (RTCSceneFlags)((size_t)a | (size_t)b); +} + +#endif + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree/kernels/builders/bvh_builder_hair.h b/thirdparty/embree/kernels/builders/bvh_builder_hair.h new file mode 100644 index 0000000000..d83e8918a1 --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_hair.h @@ -0,0 +1,411 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_sah.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_strand_array.h" + +#define NUM_HAIR_OBJECT_BINS 32 + +namespace embree +{ + namespace isa + { + struct BVHBuilderHair + { + /*! settings for builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t finished_range_threshold; //!< finished range threshold + }; + + template<typename NodeRef, + typename CreateAllocFunc, + typename CreateAABBNodeFunc, + typename SetAABBNodeFunc, + typename CreateOBBNodeFunc, + typename SetOBBNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor, + typename ReportFinishedRangeFunc> + + class BuilderT + { + ALIGNED_CLASS_(16); + friend struct BVHBuilderHair; + + typedef FastAllocator::CachedAllocator Allocator; + typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH; + typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH; + typedef HeuristicStrandSplit HeuristicStrandSplitSAH; + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + static const size_t travCostAligned = 1; + static const size_t travCostUnaligned = 5; + static const size_t intCost = 6; + + BuilderT (Scene* scene, + PrimRef* prims, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + const Settings settings) + + : cfg(settings), + prims(prims), + createAlloc(createAlloc), + createAABBNode(createAABBNode), + setAABBNode(setAABBNode), + createOBBNode(createOBBNode), + setOBBNode(setOBBNode), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + reportFinishedRange(reportFinishedRange), + alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {} + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const PrimInfoRange& range) + { + if (range.size() == 0) return true; + unsigned int firstGeomID = prims[range.begin()].geomID(); + for (size_t i=range.begin()+1; i<range.end(); i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /*! creates a large leaf that could be larger than supported by the BVH */ + NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo)) + return createLeaf(prims,pinfo,alloc); + + /* fill all children by always splitting the largest one */ + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + unsigned numChildren = 1; + children[0] = pinfo; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i])) + continue; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + __aligned(64) PrimInfoRange left, right; + if (!sameGeometry(children[bestChild])) { + alignedHeuristic.splitByGeometry(children[bestChild],left,right); + } else { + alignedHeuristic.splitFallback(children[bestChild],left,right); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* create node */ + auto node = createAABBNode(alloc); + + for (size_t i=0; i<numChildren; i++) { + const NodeRef child = createLargeLeaf(depth+1,children[i],alloc); + setAABBNode(node,i,child,children[i].geomBounds); + } + + return node; + } + + /*! performs split */ + __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize; + const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds); + + /* try standard binning in aligned space */ + float alignedObjectSAH = inf; + HeuristicBinningSAH::Split alignedObjectSplit; + if (aligned) { + alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize); + alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + } + + /* try standard binning in unaligned space */ + UnalignedHeuristicBinningSAH::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (bestSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpace(pinfo); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace); + unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH(); + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* try splitting into two strands */ + HeuristicStrandSplitSAH::Split strandSplit; + float strandSAH = inf; + if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) { + strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize); + strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH(); + bestSAH = min(strandSAH,bestSAH); + } + + /* fallback if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) + { + alignedHeuristic.deterministic_order(pinfo); + alignedHeuristic.splitFallback(pinfo,linfo,rinfo); + } + + /* perform aligned split if this is best */ + else if (bestSAH == alignedObjectSAH) { + alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo); + } + + /* perform unaligned split if this is best */ + else if (bestSAH == unalignedObjectSAH) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo); + aligned = false; + } + + /* perform strand split if this is best */ + else if (bestSAH == strandSAH) { + strandHeuristic.split(strandSplit,pinfo,linfo,rinfo); + aligned = false; + } + + /* can never happen */ + else + assert(false); + } + + /*! recursive build */ + NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(pinfo.size()); + + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + + /* create leaf node */ + if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) { + alignedHeuristic.deterministic_order(pinfo); + return createLargeLeaf(depth,pinfo,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + size_t numChildren = 1; + children[0] = pinfo; + bool aligned = true; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.minLeafSize) + continue; + + /* remember child with largest area */ + if (area(children[i].geomBounds) > bestArea) { + bestArea = area(children[i].geomBounds); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + PrimInfoRange left, right; + split(children[bestChild],left,right,aligned); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + NodeRef node; + + /* create aligned node */ + if (aligned) + { + node = createAABBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else { + for (size_t i=0; i<numChildren; i++) { + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds); + } + } + } + + /* create unaligned node */ + else + { + node = createOBBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space); + const OBBox3fa obounds(space,sinfo.geomBounds); + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else + { + for (size_t i=0; i<numChildren; i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space); + const OBBox3fa obounds(space,sinfo.geomBounds); + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds); + } + } + } + + /* reports a finished range of primrefs */ + if (unlikely(alloc_barrier)) + reportFinishedRange(pinfo); + + return node; + } + + private: + Settings cfg; + PrimRef* prims; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeFunc& createAABBNode; + const SetAABBNodeFunc& setAABBNode; + const CreateOBBNodeFunc& createOBBNode; + const SetOBBNodeFunc& setOBBNode; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + const ReportFinishedRangeFunc& reportFinishedRange; + + private: + HeuristicBinningSAH alignedHeuristic; + UnalignedHeuristicBinningSAH unalignedHeuristic; + HeuristicStrandSplitSAH strandHeuristic; + }; + + template<typename NodeRef, + typename CreateAllocFunc, + typename CreateAABBNodeFunc, + typename SetAABBNodeFunc, + typename CreateOBBNodeFunc, + typename SetOBBNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor, + typename ReportFinishedRangeFunc> + + static NodeRef build (const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + Scene* scene, + PrimRef* prims, + const PrimInfo& pinfo, + const Settings settings) + { + typedef BuilderT<NodeRef, + CreateAllocFunc, + CreateAABBNodeFunc,SetAABBNodeFunc, + CreateOBBNodeFunc,SetOBBNodeFunc, + CreateLeafFunc,ProgressMonitor, + ReportFinishedRangeFunc> Builder; + + Builder builder(scene,prims,createAlloc, + createAABBNode,setAABBNode, + createOBBNode,setOBBNode, + createLeaf,progressMonitor,reportFinishedRange,settings); + + NodeRef root = builder.recurse(1,pinfo,nullptr,true,false); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h new file mode 100644 index 0000000000..8f21e3254f --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h @@ -0,0 +1,501 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/builder.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderMorton + { + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + /*! settings for morton builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} + + /*! initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) + : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + /*! Build primitive consisting of morton code and primitive ID. */ + struct __aligned(8) BuildPrim + { + union { + struct { + unsigned int code; //!< morton code + unsigned int index; //!< i'th primitive + }; + uint64_t t; + }; + + /*! interface for radix sort */ + __forceinline operator unsigned() const { return code; } + + /*! interface for standard sort */ + __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } + }; + + /*! maps bounding box to morton code */ + struct MortonCodeMapping + { + static const size_t LATTICE_BITS_PER_DIM = 10; + static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; + + vfloat4 base; + vfloat4 scale; + + __forceinline MortonCodeMapping(const BBox3fa& bounds) + { + base = (vfloat4)bounds.lower; + const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; + scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); + } + + __forceinline const vint4 bin (const BBox3fa& box) const + { + const vfloat4 lower = (vfloat4)box.lower; + const vfloat4 upper = (vfloat4)box.upper; + const vfloat4 centroid = lower+upper; + return vint4((centroid-base)*scale); + } + + __forceinline unsigned int code (const BBox3fa& box) const + { + const vint4 binID = bin(box); + const unsigned int x = extract<0>(binID); + const unsigned int y = extract<1>(binID); + const unsigned int z = extract<2>(binID); + const unsigned int xyz = bitInterleave(x,y,z); + return xyz; + } + }; + +#if defined (__AVX2__) + + /*! for AVX2 there is a fast scalar bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest) {} + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + dest->index = index; + dest->code = mapping.code(b); + dest++; + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + }; + +#else + + /*! before AVX2 is it better to use the SSE version of bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} + + __forceinline ~MortonCodeGenerator() + { + if (slots != 0) + { + const vint4 code = bitInterleave(ax,ay,az); + for (size_t i=0; i<slots; i++) { + dest[currentID-slots+i].index = ai[i]; + dest[currentID-slots+i].code = code[i]; + } + } + } + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + const vint4 binID = mapping.bin(b); + ax[slots] = extract<0>(binID); + ay[slots] = extract<1>(binID); + az[slots] = extract<2>(binID); + ai[slots] = index; + slots++; + currentID++; + + if (slots == 4) + { + const vint4 code = bitInterleave(ax,ay,az); + vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); + vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); + slots = 0; + } + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + size_t slots; + vint4 ax, ay, az, ai; + }; + +#endif + + template< + typename ReductionTy, + typename Allocator, + typename CreateAllocator, + typename CreateNodeFunc, + typename SetNodeBoundsFunc, + typename CreateLeafFunc, + typename CalculateBounds, + typename ProgressMonitor> + + class BuilderT : private Settings + { + ALIGNED_CLASS_(16); + + public: + + BuilderT (CreateAllocator& createAllocator, + CreateNodeFunc& createNode, + SetNodeBoundsFunc& setBounds, + CreateLeafFunc& createLeaf, + CalculateBounds& calculateBounds, + ProgressMonitor& progressMonitor, + const Settings& settings) + + : Settings(settings), + createAllocator(createAllocator), + createNode(createNode), + setBounds(setBounds), + createLeaf(createLeaf), + calculateBounds(calculateBounds), + progressMonitor(progressMonitor), + morton(nullptr) {} + + ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.size() <= maxLeafSize) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + range<unsigned> children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + + do { + + /* find best child with largest number of primitives */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= maxLeafSize) + continue; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == size_t(-1)) break; + + /*! split best child into left and right child */ + auto split = children[bestChild].split(); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = split.first; + children[numChildren+0] = split.second; + numChildren++; + + } while (numChildren < branchingFactor); + + /* create node */ + auto node = createNode(alloc,numChildren); + + /* recurse into each child */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<numChildren; i++) + bounds[i] = createLargeLeaf(depth+1,children[i],alloc); + + return setBounds(node,bounds,numChildren); + } + + /*! recreates morton codes when reaching a region where all codes are identical */ + __noinline void recreateMortonCodes(const range<unsigned>& current) const + { + /* fast path for small ranges */ + if (likely(current.size() < 1024)) + { + /*! recalculate centroid bounds */ + BBox3fa centBounds(empty); + for (size_t i=current.begin(); i<current.end(); i++) + centBounds.extend(center2(calculateBounds(morton[i]))); + + /* recalculate morton codes */ + MortonCodeMapping mapping(centBounds); + for (size_t i=current.begin(); i<current.end(); i++) + morton[i].code = mapping.code(calculateBounds(morton[i])); + + /* sort morton codes */ + std::sort(morton+current.begin(),morton+current.end()); + } + else + { + /*! recalculate centroid bounds */ + auto calculateCentBounds = [&] ( const range<unsigned>& r ) { + BBox3fa centBounds = empty; + for (size_t i=r.begin(); i<r.end(); i++) + centBounds.extend(center2(calculateBounds(morton[i]))); + return centBounds; + }; + const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024), + BBox3fa(empty), calculateCentBounds, BBox3fa::merge); + + /* recalculate morton codes */ + MortonCodeMapping mapping(centBounds); + parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) { + for (size_t i=r.begin(); i<r.end(); i++) { + morton[i].code = mapping.code(calculateBounds(morton[i])); + } + }); + + /*! sort morton codes */ +#if defined(TASKING_TBB) + tbb::parallel_sort(morton+current.begin(),morton+current.end()); +#else + radixsort32(morton+current.begin(),current.size()); +#endif + } + } + + __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const + { + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + unsigned int bitpos = lzcnt(code_start^code_end); + + /* if all items mapped to same morton code, then re-create new morton codes for the items */ + if (unlikely(bitpos == 32)) + { + recreateMortonCodes(current); + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + bitpos = lzcnt(code_start^code_end); + + /* if the morton code is still the same, goto fall back split */ + if (unlikely(bitpos == 32)) { + current.split(left,right); + return; + } + } + + /* split the items at the topmost different morton code bit */ + const unsigned int bitpos_diff = 31-bitpos; + const unsigned int bitmask = 1 << bitpos_diff; + + /* find location where bit differs using binary search */ + unsigned begin = current.begin(); + unsigned end = current.end(); + while (begin + 1 != end) { + const unsigned mid = (begin+end)/2; + const unsigned bit = morton[mid].code & bitmask; + if (bit == 0) begin = mid; else end = mid; + } + unsigned center = end; +#if defined(DEBUG) + for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0); + for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask); +#endif + + left = make_range(current.begin(),center); + right = make_range(center,current.end()); + } + + ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAllocator(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= singleThreadThreshold) + progressMonitor(current.size()); + + /* create leaf node */ + if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) + return createLargeLeaf(depth,current,alloc); + + /* fill all children by always splitting the one with the largest surface area */ + range<unsigned> children[MAX_BRANCHING_FACTOR]; + split(current,children[0],children[1]); + size_t numChildren = 2; + + while (numChildren < branchingFactor) + { + /* find best child with largest number of primitives */ + int bestChild = -1; + unsigned bestItems = 0; + for (unsigned int i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= minLeafSize) + continue; + + /* remember child with largest area */ + if (children[i].size() > bestItems) { + bestItems = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + range<unsigned> left, right; + split(children[bestChild],left,right); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + } + + /* create leaf node if no split is possible */ + if (unlikely(numChildren == 1)) + return createLeaf(current,alloc); + + /* allocate node */ + auto node = createNode(alloc,numChildren); + + /* process top parts of tree parallel */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + if (current.size() > singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + bounds[i] = recurse(depth+1,children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + + /* finish tree sequentially */ + else + { + for (size_t i=0; i<numChildren; i++) + bounds[i] = recurse(depth+1,children[i],alloc,false); + } + + return setBounds(node,bounds,numChildren); + } + + /* build function */ + ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives) + { + /* sort morton codes */ + morton = src; + radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold); + + /* build BVH */ + const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + public: + CreateAllocator& createAllocator; + CreateNodeFunc& createNode; + SetNodeBoundsFunc& setBounds; + CreateLeafFunc& createLeaf; + CalculateBounds& calculateBounds; + ProgressMonitor& progressMonitor; + + public: + BuildPrim* morton; + }; + + + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetBoundsFunc, + typename CreateLeafFunc, + typename CalculateBoundsFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAllocator, + CreateNodeFunc createNode, + SetBoundsFunc setBounds, + CreateLeafFunc createLeaf, + CalculateBoundsFunc calculateBounds, + ProgressMonitor progressMonitor, + BuildPrim* src, + BuildPrim* tmp, + size_t numPrimitives, + const Settings& settings) + { + typedef BuilderT< + ReductionTy, + decltype(createAllocator()), + CreateAllocFunc, + CreateNodeFunc, + SetBoundsFunc, + CreateLeafFunc, + CalculateBoundsFunc, + ProgressMonitor> Builder; + + Builder builder(createAllocator, + createNode, + setBounds, + createLeaf, + calculateBounds, + progressMonitor, + settings); + + return builder.build(src,tmp,numPrimitives); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h new file mode 100644 index 0000000000..f9a08d65cd --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h @@ -0,0 +1,692 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define MBLUR_NUM_TEMPORAL_BINS 2 +#define MBLUR_NUM_OBJECT_BINS 32 + +#include "../bvh/bvh.h" +#include "../common/primref_mb.h" +#include "heuristic_binning_array_aligned.h" +#include "heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + template<typename T> + struct SharedVector + { + __forceinline SharedVector() {} + + __forceinline SharedVector(T* ptr, size_t refCount = 1) + : prims(ptr), refCount(refCount) {} + + __forceinline void incRef() { + refCount++; + } + + __forceinline void decRef() + { + if (--refCount == 0) + delete prims; + } + + T* prims; + size_t refCount; + }; + + template<typename BuildRecord, int MAX_BRANCHING_FACTOR> + struct LocalChildListT + { + typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; + + __forceinline LocalChildListT (const BuildRecord& record) + : numChildren(1), numSharedPrimVecs(1) + { + /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */ + children[0] = record; + primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); + } + + __forceinline ~LocalChildListT() + { + for (size_t i = 0; i < numChildren; i++) + primvecs[i]->decRef(); + } + + __forceinline BuildRecord& operator[] ( const size_t i ) { + return children[i]; + } + + __forceinline size_t size() const { + return numChildren; + } + + __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector) + { + SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; + if (lrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[bestChild] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims); + } + + if (rrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[numChildren] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims); + } + bsharedPrimVec->decRef(); + new_vector.release(); + + children[bestChild] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + public: + array_t<BuildRecord,MAX_BRANCHING_FACTOR> children; + array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs; + size_t numChildren; + + array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs; + size_t numSharedPrimVecs; + }; + + template<typename Mesh> + struct RecalculatePrimRef + { + Scene* scene; + + __forceinline RecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get<Mesh>(geomID); + const LBBox3fa lbounds = mesh->linearBounds(primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get<Mesh>(geomID); + const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range); + } + }; + + struct VirtualRecalculatePrimRef + { + Scene* scene; + + __forceinline VirtualRecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range); + } + }; + + struct BVHBuilderMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8), + travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false), + singleThreadThreshold(1024) {} + + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + bool singleLeafTimeSegment; //!< split time to single time range + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { + return a.prims.size() < b.prims.size(); + } + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< Depth of the root of this subtree. + SetMB prims; //!< The list of primitives. + }; + + struct BuildRecordSplit : public BuildRecord + { + __forceinline BuildRecordSplit () {} + + __forceinline BuildRecordSplit (size_t depth) + : BuildRecord(depth) {} + + __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split) + : BuildRecord(record), split(split) {} + + BinSplit<MBLUR_NUM_OBJECT_BINS> split; + }; + + template< + typename NodeRef, + typename RecalculatePrimRef, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; + typedef mvector<PrimRefMB>* PrimRefVector; + typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; + typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; + typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit; + + public: + + BuilderT (MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitor progressMonitor, + const Settings& settings) + : cfg(settings), + heuristicObjectSplit(), + heuristicTemporalSplit(device, recalculatePrimRef), + recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + /*! finds the best split */ + const Split find(const SetMB& set) + { + /* first try standard object split */ + const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize); + const float object_split_sah = object_split.splitSAH(); + + /* test temporal splits only when object split was bad */ + const float leaf_sah = set.leafSAH(cfg.logBlockSize); + if (object_split_sah < 0.50f*leaf_sah) + return object_split; + + /* do temporal splits only if the time range is big enough */ + if (set.time_range.size() > 1.01f/float(set.max_num_time_segments)) + { + const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize); + const float temporal_split_sah = temporal_split.splitSAH(); + + /* take temporal split if it improved SAH */ + if (temporal_split_sah < object_split_sah) + return temporal_split; + } + + return object_split; + } + + /*! array partitioning */ + __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + /* perform object split */ + if (likely(split.data == Split::SPLIT_OBJECT)) { + heuristicObjectSplit.split(split,set,lset,rset); + } + /* perform temporal split */ + else if (likely(split.data == Split::SPLIT_TEMPORAL)) { + return heuristicTemporalSplit.split(split,set,lset,rset); + } + /* perform fallback split */ + else if (unlikely(split.data == Split::SPLIT_FALLBACK)) { + set.deterministic_order(); + splitFallback(set,lset,rset); + } + /* split by geometry */ + else if (unlikely(split.data == Split::SPLIT_GEOMID)) { + set.deterministic_order(); + splitByGeometry(set,lset,rset); + } + else + assert(false); + + return std::unique_ptr<mvector<PrimRefMB>>(); + } + + /*! finds the best fallback split */ + __noinline Split findFallback(const SetMB& set) + { + /* split if primitives are not from same geometry */ + if (!sameGeometry(set)) + return Split(0.0f,Split::SPLIT_GEOMID); + + /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */ + if (cfg.singleLeafTimeSegment) + { + /* test if one primitive has more than one time segment in time range, if so split time */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const PrimRefMB& prim = (*set.prims)[i]; + const range<int> itime_range = prim.timeSegmentRange(set.time_range); + const int localTimeSegments = itime_range.size(); + assert(localTimeSegments > 0); + if (localTimeSegments > 1) { + const int icenter = (itime_range.begin() + itime_range.end())/2; + const float splitTime = prim.timeStep(icenter); + return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); + } + } + } + + /* otherwise return fallback split */ + return Split(0.0f,Split::SPLIT_FALLBACK); + } + + /*! performs fallback split */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector<PrimRefMB>& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i<center; i++) + linfo.add_primref(prims[i]); + + PrimInfoMB rinfo = empty; + for (size_t i=center; i<end; i++) + rinfo.add_primref(prims[i]); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + if (set.size() == 0) return true; + mvector<PrimRefMB>& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + unsigned int firstGeomID = prims[begin].geomID(); + for (size_t i=begin+1; i<end; i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /* split by geometry ID */ + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + + mvector<PrimRefMB>& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + + PrimInfoMB left(empty); + PrimInfoMB right(empty); + unsigned int geomID = prims[begin].geomID(); + size_t center = serial_partitioning(prims.data(),begin,end,left,right, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); }); + + new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + + const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (in.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* replace already found split by fallback split */ + const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims)); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { + mvector<PrimRefMB>& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + bool hasTimeSplits = false; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildListSplit children(current); + + do { + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split) + continue; + + force_split = false; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecordSplit& brecord = children[bestChild]; + BuildRecordSplit lrecord(current.depth+1); + BuildRecordSplit rrecord(current.depth+1); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + + /* find new splits */ + lrecord.split = findFallback(lrecord.prims); + rrecord.split = findFallback(rrecord.prims); + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = in.prims.time_range; + hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; + } + + /* create node */ + auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits); + + /* recurse into each child and perform reduction */ + LBBox3fa gbounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = createLargeLeaf(children[i],alloc); + gbounds.extend(values[i].lbounds); + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (hasTimeSplits) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= cfg.singleThreadThreshold) + progressMonitor(current.size()); + + /*! find best split */ + const Split csplit = find(current.prims); + + /*! compute leaf and split cost */ + const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); + const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH(); + assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + SetMB lprims,rprims; + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims); + bool hasTimeSplits = new_vector != nullptr; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + { + BuildRecord lrecord(lprims,current.depth+1); + BuildRecord rrecord(rprims,current.depth+1); + children.split(0,lrecord,rrecord,std::move(new_vector)); + } + + /*! split until node is full or SAH tells us to stop */ + while (children.size() < cfg.branchingFactor) + { + /*! find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i<children.size(); i++) + { + if (children[i].size() <= cfg.minLeafSize) continue; + if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) { + bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + Split csplit = find(brecord.prims); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + } + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; + } + + /* sort buildrecords for simpler shadow ray traversal */ + //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !! + + /*! create an inner node */ + auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits); + LBBox3fa gbounds = empty; + + /* spawn tasks */ + if (unlikely(current.size() > cfg.singleThreadThreshold)) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + /*! merge bounding boxes */ + for (size_t i=0; i<children.size(); i++) + gbounds.extend(values[i].lbounds); + } + /* recurse into each child */ + else + { + //for (size_t i=0; i<children.size(); i++) + for (ssize_t i=children.size()-1; i>=0; i--) { + values[i] = recurse(children[i],alloc,false); + gbounds.extend(values[i].lbounds); + } + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (unlikely(hasTimeSplits)) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + /*! builder entry function */ + __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) + { + const SetMB set(pinfo,&prims); + auto ret = recurse(BuildRecord(set,1),nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return ret; + } + + private: + Settings cfg; + HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit; + HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit; + const RecalculatePrimRef recalculatePrimRef; + const CreateAllocFunc createAlloc; + const CreateNodeFunc createNode; + const SetNodeFunc setNode; + const CreateLeafFunc createLeaf; + const ProgressMonitor progressMonitor; + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitorFunc> + + static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims, + const PrimInfoMB& pinfo, + MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitorFunc progressMonitor, + const Settings& settings) + { + typedef BuilderT< + NodeRef, + RecalculatePrimRef, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + SetNodeFunc, + CreateLeafFunc, + ProgressMonitorFunc> Builder; + + Builder builder(device, + recalculatePrimRef, + createAlloc, + createNode, + setNode, + createLeaf, + progressMonitor, + settings); + + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h new file mode 100644 index 0000000000..397e8636b1 --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur_hair.h @@ -0,0 +1,526 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_msmblur.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderHairMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< depth of the root of this subtree + SetMB prims; //!< the list of primitives + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateAABBNodeMBFunc, + typename SetAABBNodeMBFunc, + typename CreateOBBNodeMBFunc, + typename SetOBBNodeMBFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + typedef FastAllocator::CachedAllocator Allocator; + typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; + + typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal; + typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning; + typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning; + + public: + + BuilderT (Scene* scene, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + + : cfg(settings), + scene(scene), + recalculatePrimRef(recalculatePrimRef), + createAlloc(createAlloc), + createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB), + createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + unalignedHeuristic(scene), + temporalSplitHeuristic(scene->device,recalculatePrimRef) {} + + private: + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + mvector<PrimRefMB>& prims = *set.prims; + unsigned int firstGeomID = prims[set.begin()].geomID(); + for (size_t i=set.begin()+1; i<set.end(); i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /*! performs some split if SAH approaches fail */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector<PrimRefMB>& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i<center; i++) + linfo.add_primref(prims[i]); + + PrimInfoMB rinfo = empty; + for (size_t i=center; i<end; i++) + rinfo.add_primref(prims[i]); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB linfo(empty); + PrimInfoMB rinfo(empty); + unsigned int geomID = (*set.prims)[begin].geomID(); + size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); }); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + /*! creates a large leaf that could be larger than supported by the BVH */ + NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { + mvector<PrimRefMB>& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split) + return createLeaf(current.prims,alloc); + + /* fill all children by always splitting the largest one */ + LocalChildList children(current); + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split) + continue; + + force_split = false; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!sameGeometry(children[bestChild].prims)) { + splitByGeometry(children[bestChild].prims,left.prims,right.prims); + } else { + splitFallback(children[bestChild].prims,left.prims,right.prims); + } + children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>()); + + } while (children.size() < cfg.branchingFactor); + + + /* detect time_ranges that have shrunken */ + bool timesplit = false; + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + timesplit |= c.lower > p.lower || c.upper < p.upper; + } + + /* create node */ + NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit); + + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = createLargeLeaf(children[i],alloc); + bounds.extend(values[i].lbounds); + } + + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + + if (timesplit) + bounds = current.prims.linearBounds(recalculatePrimRef); + + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + + /*! performs split */ + std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit) + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const float leafSAH = current.prims.leafSAH(cfg.logBlockSize); + + /* perform standard binning in aligned space */ + HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize); + float alignedObjectSAH = alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + + /* perform standard binning in unaligned space */ + UnalignedHeuristicBinning::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (alignedObjectSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims); + const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace); + unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */ + float temporal_split_sah = inf; + typename HeuristicTemporal::Split temporal_split; + if (bestSAH > 0.5f*leafSAH) { + if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) { + temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize); + temporal_split_sah = temporal_split.splitSAH(); + bestSAH = min(temporal_split_sah,bestSAH); + } + } + + /* perform fallback split if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) { + current.prims.deterministic_order(); + splitFallback(current.prims,lrecord.prims,rrecord.prims); + } + /* perform aligned split if this is best */ + else if (likely(bestSAH == alignedObjectSAH)) { + alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims); + } + /* perform unaligned split if this is best */ + else if (likely(bestSAH == unalignedObjectSAH)) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims); + aligned = false; + } + /* perform temporal split if this is best */ + else if (likely(bestSAH == temporal_split_sah)) { + timesplit = true; + return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims); + } + else + assert(false); + + return std::unique_ptr<mvector<PrimRefMB>>(); + } + + /*! recursive build */ + NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(current.size()); + + /* create leaf node */ + if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + bool aligned = true; + bool timesplit = false; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.minLeafSize) + continue; + + /* remember child with largest area */ + const float A = children[i].prims.halfArea(); + if (A > bestArea) { + bestArea = children[i].prims.halfArea(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit); + children.split(bestChild,left,right,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + timesplit |= c.lower > p.lower || c.upper < p.upper; + } + + /* create time split node */ + if (timesplit) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequential */ + else { + for (size_t i=0; i<children.size(); i++) { + values[i] = recurse(children[i],alloc,false); + } + } + + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + + const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + + /* create aligned node */ + else if (aligned) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + LBBox3fa cbounds[MAX_BRANCHING_FACTOR]; + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + cbounds[i] = values[i].lbounds; + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) + bounds.extend(cbounds[i]); + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + /* ... continue sequentially */ + else + { + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = recurse(children[i],alloc,false); + bounds.extend(values[i].lbounds); + } + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + } + + /* create unaligned node */ + else + { + const NodeRef node = createOBBNodeMB(alloc); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims); + const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space); + const auto child = recurse(children[i],nullptr,true); + setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else + { + for (size_t i=0; i<children.size(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims); + const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space); + const auto child = recurse(children[i],alloc,false); + setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range); + } + } + + const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + } + + public: + + /*! entry point into builder */ + NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) + { + BuildRecord record(SetMB(pinfo,&prims),1); + auto root = recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + private: + Settings cfg; + Scene* scene; + const RecalculatePrimRef& recalculatePrimRef; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeMBFunc& createAABBNodeMB; + const SetAABBNodeMBFunc& setAABBNodeMB; + const CreateOBBNodeMBFunc& createOBBNodeMB; + const SetOBBNodeMBFunc& setOBBNodeMB; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + + private: + HeuristicBinning alignedHeuristic; + UnalignedHeuristicBinning unalignedHeuristic; + HeuristicTemporal temporalSplitHeuristic; + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateAABBNodeMBFunc, + typename SetAABBNodeMBFunc, + typename CreateOBBNodeMBFunc, + typename SetOBBNodeMBFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + { + typedef BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc, + CreateAABBNodeMBFunc,SetAABBNodeMBFunc, + CreateOBBNodeMBFunc,SetOBBNodeMBFunc, + CreateLeafFunc,ProgressMonitor> Builder; + + Builder builder(scene,recalculatePrimRef,createAlloc, + createAABBNodeMB,setAABBNodeMB, + createOBBNodeMB,setOBBNodeMB, + createLeaf,progressMonitor,settings); + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h new file mode 100644 index 0000000000..fff4bf2a35 --- /dev/null +++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h @@ -0,0 +1,669 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning_array_aligned.h" +#include "heuristic_spatial_array.h" +#include "heuristic_openmerge_array.h" + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL +# define NUM_OBJECT_BINS 16 +# define NUM_SPATIAL_BINS 16 +#else +# define NUM_OBJECT_BINS 32 +# define NUM_SPATIAL_BINS 16 +#endif + +namespace embree +{ + namespace isa + { + MAYBE_UNUSED static const float travCost = 1.0f; + MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + struct GeneralBVHBuilder + { + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + + /*! settings for SAH builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {} + + /*! initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(settings.sahBlockSize); + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; + if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost )) intCost = settings.intersectionCost; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + size_t primrefarrayalloc; //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished + }; + + /*! recursive state of builder */ + template<typename Set, typename Split> + struct BuildRecordT + { + public: + __forceinline BuildRecordT () {} + + __forceinline BuildRecordT (size_t depth) + : depth(depth), alloc_barrier(false), prims(empty) {} + + __forceinline BuildRecordT (size_t depth, const Set& prims) + : depth(depth), alloc_barrier(false), prims(prims) {} + + __forceinline BBox3fa bounds() const { return prims.geomBounds; } + + __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); } + __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size(); } + + __forceinline size_t size() const { return prims.size(); } + + public: + size_t depth; //!< Depth of the root of this subtree. + bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes + Set prims; //!< The list of primitives. + }; + + template<typename PrimRef, typename Set> + struct DefaultCanCreateLeafFunc + { + __forceinline bool operator()(const PrimRef*, const Set&) const { return true; } + }; + + template<typename PrimRef, typename Set> + struct DefaultCanCreateLeafSplitFunc + { + __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { } + }; + + template<typename BuildRecord, + typename Heuristic, + typename Set, + typename PrimRef, + typename ReductionTy, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + class BuilderT + { + friend struct GeneralBVHBuilder; + + BuilderT (PrimRef* prims, + Heuristic& heuristic, + const CreateAllocFunc& createAlloc, + const CreateNodeFunc& createNode, + const UpdateNodeFunc& updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) : + cfg(settings), + prims(prims), + heuristic(heuristic), + createAlloc(createAlloc), + createNode(createNode), + updateNode(updateNode), + createLeaf(createLeaf), + canCreateLeaf(canCreateLeaf), + canCreateLeafSplit(canCreateLeafSplit), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims)) + return createLeaf(prims,current.prims,alloc); + + /* fill all children by always splitting the largest one */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + do { + + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims)) + continue; + + /* remember child with largest size */ + if (children[i].prims.size() > bestSize) { + bestSize = children[i].prims.size(); + bestChild = i; + } + } + if (bestChild == (size_t)-1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!canCreateLeaf(prims,children[bestChild].prims)) { + canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims); + } else { + heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i<numChildren; i++) + children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc; + + /* create node */ + auto node = createNode(children,numChildren,alloc); + + /* recurse into each child and perform reduction */ + for (size_t i=0; i<numChildren; i++) + values[i] = createLargeLeaf(children[i],alloc); + + /* perform reduction */ + return updateNode(current,children,node,values,numChildren); + } + + const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= cfg.singleThreadThreshold) + progressMonitor(current.size()); + + /*! find best split */ + auto split = heuristic.find(current.prims,cfg.logBlockSize); + + /*! compute leaf and split cost */ + const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); + const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH(); + assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + heuristic.deterministic_order(current.prims); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + Set lprims,rprims; + heuristic.split(split,current.prims,lprims,rprims); + + /*! initialize child list with initial split */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + children[0] = BuildRecord(current.depth+1,lprims); + children[1] = BuildRecord(current.depth+1,rprims); + size_t numChildren = 2; + + /*! split until node is full or SAH tells us to stop */ + while (numChildren < cfg.branchingFactor) + { + /*! find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].prims.size() <= cfg.minLeafSize) continue; + + /* find child with largest surface area */ + if (halfArea(children[i].prims.geomBounds) > bestArea) { + bestChild = i; + bestArea = halfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + auto split = heuristic.find(brecord.prims,cfg.logBlockSize); + heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims); + children[bestChild ] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i<numChildren; i++) + children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc; + + /* sort buildrecords for faster shadow ray traversal */ + std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>()); + + /*! create an inner node */ + auto node = createNode(children,numChildren,alloc); + + /* spawn tasks */ + if (current.size() > cfg.singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + return updateNode(current,children,node,values,numChildren); + } + /* recurse into each child */ + else + { + for (size_t i=0; i<numChildren; i++) + values[i] = recurse(children[i],alloc,false); + + return updateNode(current,children,node,values,numChildren); + } + } + + private: + Settings cfg; + PrimRef* prims; + Heuristic& heuristic; + const CreateAllocFunc& createAlloc; + const CreateNodeFunc& createNode; + const UpdateNodeFunc& updateNode; + const CreateLeafFunc& createLeaf; + const CanCreateLeafFunc& canCreateLeaf; + const CanCreateLeafSplitFunc& canCreateLeafSplit; + const ProgressMonitor& progressMonitor; + }; + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + DefaultCanCreateLeafFunc<PrimRef, Set>, + DefaultCanCreateLeafSplitFunc<PrimRef, Set>, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + DefaultCanCreateLeafFunc<PrimRef, Set>(), + DefaultCanCreateLeafSplitFunc<PrimRef, Set>(), + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + CanCreateLeafFunc, + CanCreateLeafSplitFunc, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + + /* SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedSAH + { + typedef PrimInfoRange Set; + typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic; + typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + } + }; + + /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */ + struct BVHBuilderBinnedFastSpatialSAH + { + typedef PrimInfoExtRange Set; + typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split; + typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + static const unsigned int GEOMID_MASK = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + template<typename ReductionTy, typename UserCreateLeaf> + struct CreateLeafExt + { + __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf) + : userCreateLeaf(userCreateLeaf) {} + + // __noinline is workaround for ICC2016 compiler bug + template<typename Allocator> + __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const + { + for (size_t i=range.begin(); i<range.end(); i++) + prims[i].lower.u &= GEOMID_MASK; + + return userCreateLeaf(prims,range,alloc); + } + + const UserCreateLeaf userCreateLeaf; + }; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename SplitPrimitiveFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + SplitPrimitiveFunc splitPrimitive, + ProgressMonitor progressMonitor, + PrimRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic; + Heuristic heuristic(splitPrimitive,prims,pinfo); + + /* calculate total surface area */ // FIXME: this sum is not deterministic + const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double { + + double A = 0.0f; + for (size_t i=r.begin(); i<r.end(); i++) + { + PrimRef& prim = prims[i]; + A += area(prim.bounds()); + } + return A; + },std::plus<double>()); + + + /* calculate maximum number of spatial splits per primitive */ + const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1; + const float f = 10.0f; + + const float invA = 1.0f / A; + parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) { + + for (size_t i=r.begin(); i<r.end(); i++) + { + PrimRef& prim = prims[i]; + assert((prim.geomID() & SPLITS_MASK) == 0); + // FIXME: is there a better general heuristic ? + const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA); + unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf))); + prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + } + }); + + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf), + progressMonitor, + settings); + } + }; + + /* Open/Merge SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedOpenMergeSAH + { + static const size_t NUM_OBJECT_BINS_HQ = 32; + typedef PrimInfoExtRange Set; + typedef BinSplit<NUM_OBJECT_BINS_HQ> Split; + typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename BuildRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename NodeOpenerFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + NodeOpenerFunc nodeOpenerFunc, + ProgressMonitor progressMonitor, + BuildRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic; + Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor); + + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h new file mode 100644 index 0000000000..ee29d09ac9 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_binning.h @@ -0,0 +1,496 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! mapping into bins */ + template<size_t BINS> + struct BinMapping + { + public: + __forceinline BinMapping() {} + + /*! calculates the mapping */ + __forceinline BinMapping(size_t N, const BBox3fa& centBounds) + { + num = min(BINS,size_t(4.0f + 0.05f*N)); + assert(num >= 1); + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + __forceinline BinMapping(const BBox3fa& centBounds) + { + num = BINS; + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + template<typename PrimInfo> + __forceinline BinMapping(const PrimInfo& pinfo) + { + const vfloat4 eps = 1E-34f; + num = min(BINS,size_t(4.0f + 0.05f*pinfo.size())); + const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) pinfo.centBounds.lower; + } + + /*! returns number of bins */ + __forceinline size_t size() const { return num; } + + /*! slower but safe binning */ + __forceinline Vec3ia bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); +#if 1 + assert(i[0] >= 0 && (size_t)i[0] < num); + assert(i[1] >= 0 && (size_t)i[1] < num); + assert(i[2] >= 0 && (size_t)i[2] < num); + return Vec3ia(i); +#else + return Vec3ia(clamp(i,vint4(0),vint4(num-1))); +#endif + } + + /*! faster but unsafe binning */ + __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const { + return Vec3ia(floori((vfloat4(p)-ofs)*scale)); + } + + /*! faster but unsafe binning */ + template<typename PrimRef> + __forceinline Vec3ia bin_unsafe(const PrimRef& p) const { + return bin_unsafe(p.binCenter()); + } + + /*! faster but unsafe binning */ + template<typename PrimRef, typename BinBoundsAndCenter> + __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const { + return bin_unsafe(binBoundsAndCenter.binCenter(p)); + } + + template<typename PrimRef> + __forceinline bool bin_unsafe(const PrimRef& ref, + const vint4& vSplitPos, + const vbool4& splitDimMask) const // FIXME: rename to isLeft + { + return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask); + } + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),1.0f / scale[dim],ofs[dim]); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) { + return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}"; + } + + public: + size_t num; + vfloat4 ofs,scale; //!< linear function that maps to bin ID + }; + + /*! stores all information to perform some split */ + template<size_t BINS> + struct BinSplit + { + enum + { + SPLIT_OBJECT = 0, + SPLIT_FALLBACK = 1, + SPLIT_ENFORCE = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already + SPLIT_TEMPORAL = 2, + SPLIT_GEOMID = 3, + }; + + /*! construct an invalid split by default */ + __forceinline BinSplit() + : sah(inf), dim(-1), pos(0), data(0) {} + + __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0) + : sah(sah), dim(dim), fpos(fpos), data(data) {} + + /*! constructs specified split */ + __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) { + return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + union { int pos; float fpos; }; //!< bin index for splitting + unsigned int data; //!< extra optional split data + BinMapping<BINS> mapping; //!< mapping into bins + }; + + /*! stores extended information about the split */ + template<typename BBox> + struct SplitInfoT + { + + __forceinline SplitInfoT () {} + + __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds) + : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {} + + public: + size_t leftCount,rightCount; + BBox leftBounds,rightBounds; + }; + + typedef SplitInfoT<BBox3fa> SplitInfo; + typedef SplitInfoT<LBBox3fa> SplitInfo2; + + /*! stores all binning information */ + template<size_t BINS, typename PrimRef, typename BBox> + struct __aligned(64) BinInfoT + { + typedef BinSplit<BINS> Split; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + __forceinline BinInfoT() { + } + + __forceinline BinInfoT(EmptyTy) { + clear(); + } + + /*! bin access function */ + __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; } + __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; } + + __forceinline unsigned int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; } + __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; } + + __forceinline vuint4 &counts(const size_t binID) { return _counts[binID]; } + __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; } + + /*! clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i<BINS; i++) { + bounds(i,0) = bounds(i,1) = bounds(i,2) = empty; + counts(i) = vuint4(zero); + } + } + + /*! bins an array of primitives */ + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping) + { + if (unlikely(N == 0)) return; + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i+0].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + BBox prim1; Vec3fa center1; + prims[i+1].binBoundsAndCenter(prim1,center1); + const vint4 bin1 = (vint4)mapping.bin(center1); + + /*! increase bounds for bins for even primitive */ + const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); + const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); + const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); + const unsigned int s0 = (unsigned int)prims[i+0].size(); + counts(b00,0)+=s0; + counts(b01,1)+=s0; + counts(b02,2)+=s0; + + /*! increase bounds of bins for odd primitive */ + const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1); + const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1); + const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1); + const unsigned int s1 = (unsigned int)prims[i+1].size(); + counts(b10,0)+=s1; + counts(b11,1)+=s1; + counts(b12,2)+=s1; + } + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = (unsigned int)prims[i].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + /*! bins an array of primitives */ + template<typename BinBoundsAndCenter> + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (N == 0) return; + + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); + const vint4 bin1 = (vint4)mapping.bin(center1); + + /*! increase bounds for bins for even primitive */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + + /*! increase bounds of bins for odd primitive */ + const unsigned int s1 = prims[i+1].size(); + const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1); + const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1); + const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1); + } + + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) { + bin(prims+begin,end-begin,mapping); + } + + template<typename BinBoundsAndCenter> + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) { + bin<BinBoundsAndCenter>(prims+begin,end-begin,mapping,binBoundsAndCenter); + } + + /*! merges in other binning information */ + __forceinline void merge (const BinInfoT& other, size_t numBins) + { + + for (size_t i=0; i<numBins; i++) + { + counts(i) += other.counts(i); + bounds(i,0).extend(other.bounds(i,0)); + bounds(i,1).extend(other.bounds(i,1)); + bounds(i,2).extend(other.bounds(i,2)); + } + } + + /*! reduces binning information */ + static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS) + { + BinInfoT c; + for (size_t i=0; i<numBins; i++) + { + c.counts(i) = a.counts(i)+b.counts(i); + c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0)); + c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1)); + c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2)); + } + return c; + } + + /*! finds the best split by scanning binning information */ + __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty; + for (size_t i=mapping.size()-1; i>0; i--) + { + count += counts(i); + rCounts[i] = count; + bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx); + by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by); + bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz); + rAreas[i][3] = 0.0f; + } + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i<mapping.size(); i++, ii+=1) + { + count += counts(i-1); + bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx); + by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by); + bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz); + const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); + const vfloat4 rArea = rAreas[i]; + const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions. + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + + vbestPos = select(sah < vbestSAH,ii ,vbestPos); + vbestSAH = select(sah < vbestSAH,sah,vbestSAH); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + } + } + return Split(bestSAH,bestDim,bestPos,mapping); + } + + /*! calculates extended split information */ + __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const + { + if (split.dim == -1) { + new (&info) SplitInfoT<BBox>(0,empty,0,empty); + return; + } + + size_t leftCount = 0; + BBox leftBounds = empty; + for (size_t i=0; i<(size_t)split.pos; i++) { + leftCount += counts(i,split.dim); + leftBounds.extend(bounds(i,split.dim)); + } + size_t rightCount = 0; + BBox rightBounds = empty; + for (size_t i=split.pos; i<mapping.size(); i++) { + rightCount += counts(i,split.dim); + rightBounds.extend(bounds(i,split.dim)); + } + new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds); + } + + /*! gets the number of primitives left of the split */ + __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t leftCount = 0; + for (size_t i = 0; i < (size_t)split.pos; i++) { + leftCount += counts(i, split.dim); + } + return leftCount; + } + + /*! gets the number of primitives right of the split */ + __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t rightCount = 0; + for (size_t i = (size_t)split.pos; i<mapping.size(); i++) { + rightCount += counts(i, split.dim); + } + return rightCount; + } + + private: + BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 _counts[BINS]; //!< counts number of primitives that map into the bins + }; + } + + template<typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h new file mode 100644 index 0000000000..ab3b97efb9 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_aligned.h @@ -0,0 +1,200 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t> + { + __forceinline PrimInfoRange () { + } + + __forceinline PrimInfoRange(const PrimInfo& pinfo) + : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {} + + __forceinline PrimInfoRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {} + + __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + } + }; + + /*! Performs standard object binning */ + template<typename PrimRef, size_t BINS> + struct HeuristicArrayBinningSAH + { + typedef BinSplit<BINS> Split; + typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner; + typedef range<size_t> Set; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + __forceinline HeuristicArrayBinningSAH () + : prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayBinningSAH (PrimRef* prims) + : prims(prims) {} + + /*! finds the best split */ + __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + return find_template<false>(pinfo,logBlockSize); + else + return find_template<true>(pinfo,logBlockSize); + } + + template<bool parallel> + __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<BINS> mapping(pinfo); + bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + split_template<false>(split,pinfo,linfo,rinfo); + else + split_template<true>(split,pinfo,linfo,rinfo); + } + + template<bool parallel> + __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename Binner::vint vSplitPos(splitPos); + const typename Binner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + size_t center = 0; + if (!parallel) + center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning( + prims,begin,end,EmptyTy(),local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const PrimInfoRange& pinfo) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]); + } + + void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + const size_t begin = pinfo.begin(); + const size_t end = pinfo.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend_center2(prims[i]); + new (&linfo) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend_center2(prims[i]); + new (&rinfo) PrimInfoRange(center,end,right); + } + + void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + assert(range.size() > 1); + CentGeomBBox3fa left(empty); + CentGeomBBox3fa right(empty); + unsigned int geomID = prims[range.begin()].geomID(); + size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right, + [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; }, + [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); }); + + new (&linfo) PrimInfoRange(range.begin(),center,left); + new (&rinfo) PrimInfoRange(center,range.end(),right); + } + + private: + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template<typename PrimRefMB, size_t BINS> + struct HeuristicArrayBinningMB + { + typedef BinSplit<BINS> Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner; + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + ObjectBinner binner(empty); + const BinMapping<BINS> mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h new file mode 100644 index 0000000000..34a7f121bb --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_binning_array_unaligned.h @@ -0,0 +1,302 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename PrimRef, size_t BINS> + struct UnalignedHeuristicArrayBinningSAH + { + typedef BinSplit<BINS> Split; + typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner; + typedef range<size_t> Set; + + __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + const LinearSpace3fa computeAlignedSpace(const range<size_t>& set) + { + Vec3fa axis(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const unsigned int geomID = prims[i].geomID(); + const unsigned int primID = prims[i].primID(); + const uint64_t geomprimID = prims[i].ID64(); + if (geomprimID >= bestGeomPrimID) continue; + const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID); + if (sqr_length(axis1) > 1E-18f) { + axis = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + return frame(axis).transposed(); + } + + const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space) + { + auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + Geometry* mesh = scene->get(prims[i].geomID()); + bounds.extend(mesh->vbounds(space,prims[i].primID())); + } + return bounds; + }; + + const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), + CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + return PrimInfo(set.begin(),set.end(),bounds); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space) + : scene(scene), space(space) {} + + /*! returns center for binning */ + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + return embree::center2(bounds); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + bounds_o = bounds; + center_o = embree::center2(bounds); + } + + private: + Scene* scene; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space) + { + if (likely(pinfo.size() < 10000)) + return find_template<false>(pinfo,logBlockSize,space); + else + return find_template<true>(pinfo,logBlockSize,space); + } + + /*! finds the best split */ + template<bool parallel> + const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + Binner binner(empty); + const BinMapping<BINS> mapping(set); + BinBoundsAndCenter binBoundsAndCenter(scene,space); + bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (likely(set.size() < 10000)) + split_template<false>(split,space,set,lset,rset); + else + split_template<true>(split,space,set,lset,rset); + } + + /*! array partitioning */ + template<bool parallel> + __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const int splitPos = split.pos; + const int splitDim = split.dim; + BinBoundsAndCenter binBoundsAndCenter(scene,space); + + size_t center = 0; + if (likely(set.size() < 10000)) + center = serial_partitioning(prims,begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + 128); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const range<size_t>& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend_center2(prims[i]); + new (&lset) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend_center2(prims[i]); + new (&rset) PrimInfoRange(center,end,right); + } + + private: + Scene* const scene; + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template<typename PrimRefMB, size_t BINS> + struct UnalignedHeuristicArrayBinningMB + { + typedef BinSplit<BINS> Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + UnalignedHeuristicArrayBinningMB(Scene* scene) + : scene(scene) {} + + const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const PrimRefMB& prim = (*set.prims)[i]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const uint64_t geomprimID = prim.ID64(); + if (geomprimID >= bestGeomPrimID) continue; + + const Geometry* mesh = scene->get(geomID); + const range<int> tbounds = mesh->timeSegmentRange(set.time_range); + if (tbounds.size() == 0) continue; + + const size_t t = (tbounds.begin()+tbounds.end())/2; + const Vec3fa axis1 = mesh->computeDirection(primID,t); + if (sqr_length(axis1) > 1E-18f) { + axis0 = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + + return frame(axis0).transposed(); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space) + : scene(scene), time_range(time_range), space(space) {} + + /*! returns center for binning */ + template<typename PrimRef> + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds.interpolate(0.5f); + center_o = center2(bounds_o); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds; + center_o = center2(lbounds.interpolate(0.5f)); + } + + private: + Scene* scene; + BBox1f time_range; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + ObjectBinner binner(empty); + const BinMapping<BINS> mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + + private: + Scene* scene; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h new file mode 100644 index 0000000000..4249d16ea1 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h @@ -0,0 +1,443 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +// TODO: +// - adjust parallel build thresholds +// - openNodesBasedOnExtend should consider max extended size + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +/* stop opening of all bref.geomIDs are the same */ +#define EQUAL_GEOMID_STOP_CRITERIA 1 + +/* 10% spatial extend threshold */ +#define MAX_EXTEND_THRESHOLD 0.1f + +/* maximum is 8 children */ +#define MAX_OPENED_CHILD_NODES 8 + +/* open until all build refs are below threshold size in one step */ +#define USE_LOOP_OPENING 0 + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS> + struct HeuristicArrayOpenMergeSAH + { + typedef BinSplit<OBJECT_BINS> Split; + typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner; + + static const size_t PARALLEL_THRESHOLD = 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 512; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 128; + + __forceinline HeuristicArrayOpenMergeSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size) + : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) + { + assert(max_open_size <= MAX_OPENED_CHILD_NODES); + } + + struct OpenHeuristic + { + __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo ) + { + const Vec3fa diag = pinfo.geomBounds.size(); + dim = maxDim(diag); + assert(diag[dim] > 0.0f); + inv_max_extend = 1.0f / diag[dim]; + } + + __forceinline bool operator () ( PrimRef& prim ) const { + return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD; + } + + private: + size_t dim; + float inv_max_extend; + }; + + /*! compute extended ranges */ + __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! move ranges */ + __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+right_size] = prims0[i]; + }); + } + else + { + /* no overlap, move entire right range to new location, can be made fully parallel */ + parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+left_ext_range_size] = prims0[i]; + }); + } + /* update right range */ + assert(rset.ext_end() + left_ext_range_size == set.ext_end()); + rset.move_right(left_ext_range_size); + } + } + + /* estimates the extra space required when opening, and checks if all primitives are from same geometry */ + __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const unsigned int geomID = prims0[set.begin()].geomID(); + + auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> { + bool commonGeomID = true; + size_t opens = 0; + for (size_t i=r.begin(); i<r.end(); i++) { + commonGeomID &= prims0[i].geomID() == geomID; + if (heuristic(prims0[i])) + opens += prims0[i].node.getN()-1; // coarse approximation + } + return std::pair<size_t,bool>(opens,commonGeomID); + }; + auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> { + return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second); + }; + return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction); + } + + // FIXME: should consider maximum available extended size + __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const size_t ext_range_start = set.end(); + + if (false && set.size() < PARALLEL_THRESHOLD) + { + size_t extra_elements = 0; + for (size_t i=set.begin(); i<set.end(); i++) + { + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + assert(extra_elements + n-1 <= set.ext_range_size()); + for (size_t j=0; j<n; j++) + set.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1; j<n; j++) + prims0[ext_range_start+extra_elements+j-1] = tmp[j]; + extra_elements += n-1; + } + } + set._end += extra_elements; + } + else + { + std::atomic<size_t> ext_elements; + ext_elements.store(0); + PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo { + PrimInfo info(empty); + for (size_t i=r.begin(); i<r.end(); i++) + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + const size_t ID = ext_elements.fetch_add(n-1); + assert(ID + n-1 <= set.ext_range_size()); + + for (size_t j=0; j<n; j++) + info.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1; j<n; j++) + prims0[ext_range_start+ID+j-1] = tmp[j]; + } + return info; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); + set.centBounds.extend(info.centBounds); + assert(ext_elements.load() <= set.ext_range_size()); + set._end += ext_elements.load(); + } + } + + __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements) + { + const OpenHeuristic heuristic(set); + size_t next_iteration_extra_elements = est_new_elements; + + while (next_iteration_extra_elements <= set.ext_range_size()) + { + next_iteration_extra_elements = 0; + size_t extra_elements = 0; + const size_t ext_range_start = set.end(); + + for (size_t i=set.begin(); i<set.end(); i++) + { + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + assert(extra_elements + n-1 <= set.ext_range_size()); + for (size_t j=0;j<n;j++) + set.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1;j<n;j++) + prims0[ext_range_start+extra_elements+j-1] = tmp[j]; + extra_elements += n-1; + + for (size_t j=0; j<n; j++) + if (heuristic(tmp[j])) + next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation + + } + } + assert( extra_elements <= set.ext_range_size()); + set._end += extra_elements; + + for (size_t i=set.begin();i<set.end();i++) + assert(prims0[i].numPrimitives() > 0); + + if (unlikely(next_iteration_extra_elements == 0)) break; + } + } + + __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize) + { + /* single element */ + if (set.size() <= 1) + return Split(); + + /* disable opening if there is no overlap */ + const size_t D = 4; + if (unlikely(set.has_ext_range() && set.size() <= D)) + { + bool disjoint = true; + for (size_t j=set.begin(); j<set.end()-1; j++) { + for (size_t i=set.begin()+1; i<set.end(); i++) { + if (conjoint(prims0[j].bounds(),prims0[i].bounds())) { + disjoint = false; break; + } + } + } + if (disjoint) set.set_ext_range(set.end()); /* disables opening */ + } + + std::pair<size_t,bool> p(0,false); + + /* disable opening when all primitives are from same geometry */ + if (unlikely(set.has_ext_range())) + { + p = getProperties(set); +#if EQUAL_GEOMID_STOP_CRITERIA == 1 + if (p.second) set.set_ext_range(set.end()); /* disable opening */ +#endif + } + + /* open nodes when we have sufficient space available */ + if (unlikely(set.has_ext_range())) + { +#if USE_LOOP_OPENING == 1 + openNodesBasedOnExtendLoop(set,p.first); +#else + if (p.first <= set.ext_range_size()) + openNodesBasedOnExtend(set); +#endif + + /* disable opening when unsufficient space for opening a node available */ + if (set.ext_range_size() < max_open_size-1) + set.set_ext_range(set.end()); /* disable opening */ + } + + /* find best split */ + return object_find(set,logBlockSize); + } + + + /*! finds the best object split */ + __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize); + else return parallel_object_find (set,logBlockSize); + } + + /*! finds the best object split */ + __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set.centBounds); + binner.bin(prims0,set.begin(),set.end(),mapping); + return binner.best(mapping,logBlockSize); + } + + /*! finds the best split */ + __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set.centBounds); + const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + auto body = [&] (const range<size_t>& r) -> Binner { + Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; + }; + auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { + Binner r = b0; r.merge(b1,_mapping.size()); return r; + }; + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + splitFallback(set,lset,rset); + return; + } + + std::pair<size_t,size_t> ext_weights(0,0); + + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split,set,lset,rset); + else + ext_weights = parallel_object_split(split,set,lset,rset); + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! array partitioning */ + std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }); + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(local_left.size(),local_right.size()); + } + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }, + [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + + return std::pair<size_t,size_t>(left.size(),right.size()); + } + + void deterministic_order(const extended_range<size_t>& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i<center; i++) + left.add_center2(prims0[i]); + + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i<end; i++) + right.add_center2(prims0[i]); + + const size_t rweight = right.end; + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) + { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const NodeOpenerFunc& nodeOpenerFunc; + size_t max_open_size; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h new file mode 100644 index 0000000000..a6939ba258 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h @@ -0,0 +1,414 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "priminfo.h" + +namespace embree +{ + static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5; + + namespace isa + { + + /*! mapping into bins */ + template<size_t BINS> + struct SpatialBinMapping + { + public: + __forceinline SpatialBinMapping() {} + + /*! calculates the mapping */ + __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo) + { + const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower; + const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper; + const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper)); + const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size()); + scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag); + ofs = (vfloat4) pinfo.geomBounds.lower; + inv_scale = 1.0f / scale; + } + + /*! slower but safe binning */ + __forceinline vint4 bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); + return clamp(i,vint4(0),vint4(BINS-1)); + } + + __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const + { +#if defined(__AVX__) + const vfloat8 ofs8(ofs); + const vfloat8 scale8(scale); + const vint8 lu = floori((vfloat8::loadu(&b)-ofs8)*scale8); + const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1)); + return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu)); +#else + const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale); + const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale); + const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1)); + const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1)); + return std::pair<vint4,vint4>(c_lower,c_upper); +#endif + } + + + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),inv_scale[dim],ofs[dim]); + } + + /*! calculates left spatial position of bin */ + template<size_t N> + __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const { + return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim])); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + public: + vfloat4 ofs,scale,inv_scale; //!< linear function that maps to bin ID + }; + + /*! stores all information required to perform some split */ + template<size_t BINS> + struct SpatialBinSplit + { + /*! construct an invalid split by default */ + __forceinline SpatialBinSplit() + : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) { + return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + int pos; //!< split position + int left; //!< number of elements on the left side + int right; //!< number of elements on the right side + float factor; //!< factor splitting the extended range + SpatialBinMapping<BINS> mapping; //!< mapping into bins + }; + + /*! stores all binning information */ + template<size_t BINS, typename PrimRef> + struct __aligned(64) SpatialBinInfo + { + SpatialBinInfo() { + } + + __forceinline SpatialBinInfo(EmptyTy) { + clear(); + } + + /*! clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i<BINS; i++) { + bounds[i][0] = bounds[i][1] = bounds[i][2] = empty; + numBegin[i] = numEnd[i] = 0; + } + } + + /*! adds binning data */ + __forceinline void add(const size_t dim, + const size_t beginID, + const size_t endID, + const size_t binID, + const BBox3fa &b, + const size_t n = 1) + { + assert(beginID < BINS); + assert(endID < BINS); + assert(binID < BINS); + + numBegin[beginID][dim]+=(unsigned int)n; + numEnd [endID][dim]+=(unsigned int)n; + bounds [binID][dim].extend(b); + } + + /*! extends binning bounds */ + __forceinline void extend(const size_t dim, + const size_t binID, + const BBox3fa &b) + { + assert(binID < BINS); + bounds [binID][dim].extend(b); + } + + /*! bins an array of triangles */ + template<typename SplitPrimitive> + __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=0; i<N; i++) + { + const PrimRef prim = prims[i]; + unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (unlikely(splits == 1)) + { + const vint4 bin = mapping.bin(center(prim.bounds())); + for (size_t dim=0; dim<3; dim++) + { + assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); + numBegin[bin[dim]][dim]++; + numEnd [bin[dim]][dim]++; + bounds [bin[dim]][dim].extend(prim.bounds()); + } + } + else + { + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + size_t bin; + PrimRef rest = prim; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + numBegin[l][dim]++; + numEnd [l][dim]++; + bounds [l][dim].extend(prim.bounds()); + continue; + } + + for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) + { + const float pos = mapping.pos(bin+1,dim); + + PrimRef left,right; + splitPrimitive(rest,(int)dim,pos,left,right); + if (unlikely(left.bounds().empty())) l++; + bounds[bin][dim].extend(left.bounds()); + rest = right; + } + if (unlikely(rest.bounds().empty())) r--; + numBegin[l][dim]++; + numEnd [r][dim]++; + bounds [bin][dim].extend(rest.bounds()); + } + } + } + } + + /*! bins a range of primitives inside an array */ + template<typename SplitPrimitive> + void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { + bin(splitPrimitive,prims+begin,end-begin,mapping); + } + + /*! bins an array of primitives */ + template<typename PrimitiveSplitterFactory> + __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=begin; i<end; i++) + { + const PrimRef &prim = source[i]; + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + if (unlikely(mapping.invalid(dim))) + continue; + + size_t bin; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + add(dim,l,l,l,prim.bounds()); + continue; + } + const size_t bin_start = bin0[dim]; + const size_t bin_end = bin1[dim]; + BBox3fa rest = prim.bounds(); + const auto splitter = splitterFactory(prim); + for (bin=bin_start; bin<bin_end; bin++) + { + const float pos = mapping.pos(bin+1,dim); + BBox3fa left,right; + splitter(rest,dim,pos,left,right); + if (unlikely(left.empty())) l++; + extend(dim,bin,left); + rest = right; + } + if (unlikely(rest.empty())) r--; + add(dim,l,r,bin,rest); + } + } + } + + + + /*! bins an array of primitives */ + __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=begin; i<end; i++) + { + const PrimRef &prim = source[i]; + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + if (unlikely(mapping.invalid(dim))) + continue; + + const size_t l = bin0[dim]; + const size_t r = bin1[dim]; + + const unsigned int n = prim.primID(); + + // same bin optimization + if (likely(l == r)) + { + add(dim,l,l,l,prim.bounds(),n); + continue; + } + const size_t bin_start = bin0[dim]; + const size_t bin_end = bin1[dim]; + for (size_t bin=bin_start; bin<bin_end; bin++) + add(dim,l,r,bin,prim.bounds(),n); + } + } + } + + /*! merges in other binning information */ + void merge (const SpatialBinInfo& other) + { + for (size_t i=0; i<BINS; i++) + { + numBegin[i] += other.numBegin[i]; + numEnd [i] += other.numEnd [i]; + bounds[i][0].extend(other.bounds[i][0]); + bounds[i][1].extend(other.bounds[i][1]); + bounds[i][2].extend(other.bounds[i][2]); + } + } + + /*! merges in other binning information */ + static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b) + { + SpatialBinInfo c(empty); + for (size_t i=0; i<BINS; i++) + { + c.numBegin[i] += a.numBegin[i]+b.numBegin[i]; + c.numEnd [i] += a.numEnd [i]+b.numEnd [i]; + c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]); + c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]); + c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]); + } + return c; + } + + /*! finds the best split by scanning binning information */ + SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty; + for (size_t i=BINS-1; i>0; i--) + { + count += numEnd[i]; + rCounts[i] = count; + bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx); + by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by); + bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz); + rAreas[i][3] = 0.0f; + } + + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i<BINS; i++, ii+=1) + { + count += numBegin[i-1]; + bx.extend(bounds[i-1][0]); float Ax = halfArea(bx); + by.extend(bounds[i-1][1]); float Ay = halfArea(by); + bz.extend(bounds[i-1][2]); float Az = halfArea(bz); + const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); + const vfloat4 rArea = rAreas[i]; + const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + const vbool4 mask = sah < vbestSAH; + vbestPos = select(mask,ii ,vbestPos); + vbestSAH = select(mask,sah,vbestSAH); + vbestlCount = select(mask,count,vbestlCount); + vbestrCount = select(mask,rCounts[i],vbestrCount); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + unsigned int bestlCount = 0; + unsigned int bestrCount = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + bestlCount = vbestlCount[dim]; + bestrCount = vbestrCount[dim]; + } + } + assert(bestSAH >= 0.0f); + + /* return invalid split if no split found */ + if (bestDim == -1) + return SpatialBinSplit<BINS>(inf,-1,0,mapping); + + /* return best found split */ + return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping); + } + + private: + BBox3fa bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 numBegin[BINS]; //!< number of primitives starting in bin + vuint4 numEnd[BINS]; //!< number of primitives ending in bin + }; + } +} + diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h new file mode 100644 index 0000000000..60d235f48d --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h @@ -0,0 +1,546 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +namespace embree +{ + namespace isa + { +#if 0 +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f +#else +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f +#endif + + struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t> + { + __forceinline PrimInfoExtRange() { + } + + __forceinline PrimInfoExtRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {} + + __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + } + }; + + template<typename ObjectSplit, typename SpatialSplit> + struct Split2 + { + __forceinline Split2 () {} + + __forceinline Split2 (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + } + + __forceinline Split2& operator= (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + return *this; + } + + __forceinline ObjectSplit& objectSplit() { return *( ObjectSplit*)data; } + __forceinline const ObjectSplit& objectSplit() const { return *(const ObjectSplit*)data; } + + __forceinline SpatialSplit& spatialSplit() { return *( SpatialSplit*)data; } + __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; } + + __forceinline Split2 (const ObjectSplit& objectSplit, float sah) + : spatial(false), sah(sah) + { + new (data) ObjectSplit(objectSplit); + } + + __forceinline Split2 (const SpatialSplit& spatialSplit, float sah) + : spatial(true), sah(sah) + { + new (data) SpatialSplit(spatialSplit); + } + + __forceinline float splitSAH() const { + return sah; + } + + __forceinline bool valid() const { + return sah < float(inf); + } + + public: + __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)]; + bool spatial; + float sah; + }; + + /*! Performs standard object binning */ + template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS> + struct HeuristicArraySpatialSAH + { + typedef BinSplit<OBJECT_BINS> ObjectSplit; + typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner; + + typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit; + typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner; + + //typedef extended_range<size_t> Set; + typedef Split2<ObjectSplit,SpatialSplit> Split; + + static const size_t PARALLEL_THRESHOLD = 3*1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 64; + + __forceinline HeuristicArraySpatialSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info) + : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {} + + + /*! compute extended ranges */ + __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! move ranges */ + __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+right_size] = prims0[i]; + }); + } + else + { + /* no overlap, move entire right range to new location, can be made fully parallel */ + parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+left_ext_range_size] = prims0[i]; + }); + } + /* update right range */ + assert(rset.ext_end() + left_ext_range_size == set.ext_end()); + rset.move_right(left_ext_range_size); + } + } + + /*! finds the best split */ + const Split find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SplitInfo oinfo; + const ObjectSplit object_split = object_find(set,logBlockSize,oinfo); + const float object_split_sah = object_split.splitSAH(); + + if (unlikely(set.has_ext_range())) + { + const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds); + + /* do only spatial splits if the child bounds overlap */ + if (safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) && + safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds)) + { + const SpatialSplit spatial_split = spatial_find(set, logBlockSize); + const float spatial_split_sah = spatial_split.splitSAH(); + + /* valid spatial split, better SAH and number of splits do not exceed extended range */ + if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah && + spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size()) + { + return Split(spatial_split,spatial_split_sah); + } + } + } + + return Split(object_split,object_split_sah); + } + + /*! finds the best object split */ + __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info); + else return parallel_object_find (set,logBlockSize,info); + } + + /*! finds the best object split */ + __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set); + binner.bin(prims0,set.begin(),set.end(),mapping); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best split */ + __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set); + const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; }, + [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; }); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best spatial split */ + __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize); + else return parallel_spatial_find (set, logBlockSize); + } + + /*! finds the best spatial split */ + __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping<SPATIAL_BINS> mapping(set); + binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping<SPATIAL_BINS> mapping(set); + const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range<size_t>& r) -> SpatialBinner { + SpatialBinner binner(empty); + binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); + return binner; }, + [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + + /*! subdivides primitives based on a spatial split */ + __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping) + { + assert(set.has_ext_range()); + const size_t max_ext_range_size = set.ext_range_size(); + const size_t ext_range_start = set.end(); + + /* atomic counter for number of primref splits */ + std::atomic<size_t> ext_elements; + ext_elements.store(0); + + const float fpos = split.mapping.pos(split.pos,split.dim); + + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + + parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin();i<r.end();i++) + { + const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */ + + //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; + //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; + //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) + if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) + { + assert(splits > 1); + + PrimRef left,right; + const auto splitter = splitterFactory(prims0[i]); + splitter(prims0[i],split.dim,fpos,left,right); + + // no empty splits + if (unlikely(left.bounds().empty() || right.bounds().empty())) continue; + + left.lower.u = (left.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + + const size_t ID = ext_elements.fetch_add(1); + + /* break if the number of subdivided elements are greater than the maximum allowed size */ + if (unlikely(ID >= max_ext_range_size)) + break; + + /* only write within the correct bounds */ + assert(ID < max_ext_range_size); + prims0[i] = left; + prims0[ext_range_start+ID] = right; + } + } + }); + + const size_t numExtElements = min(max_ext_range_size,ext_elements.load()); + assert(set.end()+numExtElements<=set.ext_end()); + set._end += numExtElements; + } + + /*! array partitioning */ + void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + std::pair<size_t,size_t> ext_weights(0,0); + + if (unlikely(split.spatial)) + { + create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); + + /* spatial split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset); + else + ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset); + } + else + { + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset); + else + ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset); + } + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! array partitioning */ + std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + auto isLeft = [&] (const PrimRef &ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + void deterministic_order(const PrimInfoExtRange& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + void splitFallback(const PrimInfoExtRange& set, + PrimInfoExtRange& lset, + PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i<center; i++) { + left.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i<end; i++) { + right.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t rweight = right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const PrimitiveSplitterFactory& splitterFactory; + const CentGeomBBox3fa& root_info; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_strand_array.h b/thirdparty/embree/kernels/builders/heuristic_strand_array.h new file mode 100644 index 0000000000..19c7fcdaa8 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_strand_array.h @@ -0,0 +1,188 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + struct HeuristicStrandSplit + { + typedef range<size_t> Set; + + static const size_t PARALLEL_THRESHOLD = 10000; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64; + + /*! stores all information to perform some split */ + struct Split + { + /*! construct an invalid split by default */ + __forceinline Split() + : sah(inf), axis0(zero), axis1(zero) {} + + /*! constructs specified split */ + __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1) + : sah(sah), axis0(axis0), axis1(axis1) {} + + /*! calculates standard surface area heuristic for the split */ + __forceinline float splitSAH() const { return sah; } + + /*! test if this split is valid */ + __forceinline bool valid() const { return sah != float(inf); } + + public: + float sah; //!< SAH cost of the split + Vec3fa axis0, axis1; //!< axis the two strands are aligned into + }; + + __forceinline HeuristicStrandSplit () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + __forceinline const Vec3fa direction(const PrimRef& prim) { + return scene->get(prim.geomID())->computeDirection(prim.primID()); + } + + __forceinline const BBox3fa bounds(const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(prim.primID()); + } + + __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(space,prim.primID()); + } + + /*! finds the best split */ + const Split find(const range<size_t>& set, size_t logBlockSize) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /* curve with minimum ID determines first axis */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const uint64_t geomprimID = prims[i].ID64(); + if (geomprimID >= bestGeomPrimID) continue; + const Vec3fa axis = direction(prims[i]); + if (sqr_length(axis) > 1E-18f) { + axis0 = normalize(axis); + bestGeomPrimID = geomprimID; + } + } + + /* find 2nd axis that is most misaligned with first axis and has minimum ID */ + float bestCos = 1.0f; + Vec3fa axis1 = axis0; + bestGeomPrimID = -1; + for (size_t i=set.begin(); i<set.end(); i++) + { + const uint64_t geomprimID = prims[i].ID64(); + Vec3fa axisi = direction(prims[i]); + float leni = length(axisi); + if (leni == 0.0f) continue; + axisi /= leni; + float cos = abs(dot(axisi,axis0)); + if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) { + bestCos = cos; axis1 = axisi; + bestGeomPrimID = geomprimID; + } + } + + /* partition the two strands */ + size_t lnum = 0, rnum = 0; + BBox3fa lbounds = empty, rbounds = empty; + const LinearSpace3fa space0 = frame(axis0).transposed(); + const LinearSpace3fa space1 = frame(axis1).transposed(); + + for (size_t i=set.begin(); i<set.end(); i++) + { + PrimRef& prim = prims[i]; + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,axis0)); + const float cos1 = abs(dot(axisi,axis1)); + + if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); } + else { rnum++; rbounds.extend(bounds(space1,prim)); } + } + + /*! return an invalid split if we do not partition */ + if (lnum == 0 || rnum == 0) + return Split(inf,axis0,axis1); + + /*! calculate sah for the split */ + const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize; + const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize; + const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds)); + return Split(sah,axis0,axis1); + } + + /*! array partitioning */ + void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + + auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,split.axis0)); + const float cos1 = abs(dot(axisi,split.axis1)); + return cos0 > cos1; + }; + + auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { + pinfo.extend(bounds(ref)); + }; + + size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const Set& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend(bounds(prims[i])); + new (&lset) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend(bounds(prims[i])); + new (&rset) PrimInfoRange(center,end,right); + } + + private: + Scene* const scene; + PrimRef* const prims; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h new file mode 100644 index 0000000000..b968e01c90 --- /dev/null +++ b/thirdparty/embree/kernels/builders/heuristic_timesplit_array.h @@ -0,0 +1,237 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/primref_mb.h" +#include "../../common/algorithms/parallel_filter.h" + +#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS> + struct HeuristicMBlurTemporalSplit + { + typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; + typedef mvector<PrimRefMB>* PrimRefVector; + typedef typename PrimRefMB::BBox BBox; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef) + : device(device), recalculatePrimRef(recalculatePrimRef) {} + + struct TemporalBinInfo + { + __forceinline TemporalBinInfo () { + } + + __forceinline TemporalBinInfo (EmptyTy) + { + for (size_t i=0; i<BINS-1; i++) + { + count0[i] = count1[i] = 0; + bounds0[i] = bounds1[i] = empty; + } + } + + void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) + { + for (int b=0; b<BINS-1; b++) + { + const float t = float(b+1)/float(BINS); + const float ct = lerp(time_range.lower,time_range.upper,t); + const float center_time = set.align_time(ct); + if (center_time <= time_range.lower) continue; + if (center_time >= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* find linear bounds for both time segments */ + for (size_t i=begin; i<end; i++) + { + if (prims[i].time_range_overlap(dt0)) + { + const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0); +#if MBLUR_BIN_LBBOX + bounds0[b].extend(bn0); +#else + bounds0[b].extend(bn0.interpolate(0.5f)); +#endif + count0[b] += prims[i].timeSegmentRange(dt0).size(); + } + + if (prims[i].time_range_overlap(dt1)) + { + const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1); +#if MBLUR_BIN_LBBOX + bounds1[b].extend(bn1); +#else + bounds1[b].extend(bn1.interpolate(0.5f)); +#endif + count1[b] += prims[i].timeSegmentRange(dt1).size(); + } + } + } + } + + __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) + { + if (likely(end-begin < parallelThreshold)) { + bin(prims,begin,end,time_range,set,recalculatePrimRef); + } + else + { + auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { + TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; + }; + *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2); + } + } + + /*! merges in other binning information */ + __forceinline void merge (const TemporalBinInfo& other) + { + for (size_t i=0; i<BINS-1; i++) + { + count0[i] += other.count0[i]; + count1[i] += other.count1[i]; + bounds0[i].extend(other.bounds0[i]); + bounds1[i].extend(other.bounds1[i]); + } + } + + static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) { + TemporalBinInfo r = a; r.merge(b); return r; + } + + Split best(int logBlockSize, BBox1f time_range, const SetMB& set) + { + float bestSAH = inf; + float bestPos = 0.0f; + for (int b=0; b<BINS-1; b++) + { + float t = float(b+1)/float(BINS); + float ct = lerp(time_range.lower,time_range.upper,t); + const float center_time = set.align_time(ct); + if (center_time <= time_range.lower) continue; + if (center_time >= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* calculate sah */ + const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size(); + float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size(); + if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time + if (unlikely(rCount == 0)) sah1 = 0.0f; + const float sah = sah0+sah1; + if (sah < bestSAH) { + bestSAH = sah; + bestPos = center_time; + } + } + return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos); + } + + public: + size_t count0[BINS-1]; + size_t count1[BINS-1]; + BBox bounds0[BINS-1]; + BBox bounds1[BINS-1]; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + assert(set.size() > 0); + TemporalBinInfo binner(empty); + binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef); + Split tsplit = binner.best((int)logBlockSize,set.time_range,set); + if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split + return tsplit; + } + + __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(tsplit.sah != float(inf)); + assert(tsplit.fpos > set.time_range.lower); + assert(tsplit.fpos < set.time_range.upper); + + float center_time = tsplit.fpos; + const BBox1f time_range0(set.time_range.lower,center_time); + const BBox1f time_range1(center_time,set.time_range.upper); + mvector<PrimRefMB>& prims = *set.prims; + + /* calculate primrefs for first time range */ + std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size())); + PrimRefVector lprims = new_vector.get(); + + auto reduction_func0 = [&] (const range<size_t>& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i<r.end(); i++) + { + if (likely(prims[i].time_range_overlap(time_range0))) + { + const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0); + (*lprims)[i-set.begin()] = prim; + pinfo.add_primref(prim); + } + else + { + (*lprims)[i-set.begin()] = prims[i]; + } + } + return pinfo; + }; + PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2); + + /* primrefs for first time range are in lprims[0 .. set.size()) */ + /* some primitives may need to be filtered out */ + if (linfo.size() != set.size()) + linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); }); + + lset = SetMB(linfo,lprims,time_range0); + + /* calculate primrefs for second time range */ + auto reduction_func1 = [&] (const range<size_t>& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i<r.end(); i++) + { + if (likely(prims[i].time_range_overlap(time_range1))) + { + const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1); + prims[i] = prim; + pinfo.add_primref(prim); + } + } + return pinfo; + }; + PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2); + rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size()); + + /* primrefs for second time range are in prims[set.begin() .. set.end()) */ + /* some primitives may need to be filtered out */ + if (rinfo.size() != set.size()) + rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); }); + + rset = SetMB(rinfo,&prims,time_range1); + + return new_vector; + } + + private: + MemoryMonitorInterface* device; // device to report memory usage to + const RecalculatePrimRef recalculatePrimRef; + }; + } +} diff --git a/thirdparty/embree/kernels/builders/priminfo.h b/thirdparty/embree/kernels/builders/priminfo.h new file mode 100644 index 0000000000..fee515247a --- /dev/null +++ b/thirdparty/embree/kernels/builders/priminfo.h @@ -0,0 +1,362 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + // FIXME: maybe there's a better place for this util fct + __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2) + { + const Vec3fa e0 = v1-v0; + const Vec3fa e1 = v2-v0; + const Vec3fa d = cross(e0,e1); + return fabs(d.x) + fabs(d.y) + fabs(d.z); + } + + //namespace isa + //{ + template<typename BBox> + class CentGeom + { + public: + __forceinline CentGeom () {} + + __forceinline CentGeom (EmptyTy) + : geomBounds(empty), centBounds(empty) {} + + __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) + : geomBounds(geomBounds), centBounds(centBounds) {} + + template<typename PrimRef> + __forceinline void extend_primref(const PrimRef& prim) + { + BBox bounds; Vec3fa center; + prim.binBoundsAndCenter(bounds,center); + geomBounds.extend(bounds); + centBounds.extend(center); + } + + template<typename PrimRef> + __forceinline void extend_center2(const PrimRef& prim) + { + BBox3fa bounds = prim.bounds(); + geomBounds.extend(bounds); + centBounds.extend(bounds.center2()); + } + + __forceinline void extend(const BBox& geomBounds_) { + geomBounds.extend(geomBounds_); + centBounds.extend(center2(geomBounds_)); + } + + __forceinline void merge(const CentGeom& other) + { + geomBounds.extend(other.geomBounds); + centBounds.extend(other.centBounds); + } + + static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) { + CentGeom r = a; r.merge(b); return r; + } + + public: + BBox geomBounds; //!< geometry bounds of primitives + BBox3fa centBounds; //!< centroid bounds of primitives + }; + + typedef CentGeom<BBox3fa> CentGeomBBox3fa; + + /*! stores bounding information for a set of primitives */ + template<typename BBox> + class PrimInfoT : public CentGeom<BBox> + { + public: + using CentGeom<BBox>::geomBounds; + using CentGeom<BBox>::centBounds; + + __forceinline PrimInfoT () {} + + __forceinline PrimInfoT (EmptyTy) + : CentGeom<BBox>(empty), begin(0), end(0) {} + + __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {} + + template<typename PrimRef> + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom<BBox>::extend_primref(prim); + end++; + } + + template<typename PrimRef> + __forceinline void add_center2(const PrimRef& prim) { + CentGeom<BBox>::extend_center2(prim); + end++; + } + + template<typename PrimRef> + __forceinline void add_center2(const PrimRef& prim, const size_t i) { + CentGeom<BBox>::extend_center2(prim); + end+=i; + } + + /*__forceinline void add(const BBox& geomBounds_) { + CentGeom<BBox>::extend(geomBounds_); + end++; + } + + __forceinline void add(const BBox& geomBounds_, const size_t i) { + CentGeom<BBox>::extend(geomBounds_); + end+=i; + }*/ + + __forceinline void merge(const PrimInfoT& other) + { + CentGeom<BBox>::merge(other); + begin += other.begin; + end += other.end; + } + + static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) { + PrimInfoT r = a; r.merge(b); return r; + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return end-begin; + } + + __forceinline float halfArea() { + return expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + //return halfArea(geomBounds)*blocks(num); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + //return halfArea(geomBounds)*float((num+3) >> 2); + //return halfArea(geomBounds)*blocks(num); + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) { + return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}"; + } + + public: + size_t begin,end; //!< number of primitives + }; + + typedef PrimInfoT<BBox3fa> PrimInfo; + //typedef PrimInfoT<LBBox3fa> PrimInfoMB; + + /*! stores bounding information for a set of primitives */ + template<typename BBox> + class PrimInfoMBT : public CentGeom<BBox> + { + public: + using CentGeom<BBox>::geomBounds; + using CentGeom<BBox>::centBounds; + + __forceinline PrimInfoMBT () { + } + + __forceinline PrimInfoMBT (EmptyTy) + : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + __forceinline PrimInfoMBT (size_t begin, size_t end) + : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + template<typename PrimRef> + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom<BBox>::extend_primref(prim); + time_range.extend(prim.time_range); + object_range._end++; + num_time_segments += prim.size(); + if (max_num_time_segments < prim.totalTimeSegments()) { + max_num_time_segments = prim.totalTimeSegments(); + max_time_range = prim.time_range; + } + } + + __forceinline void merge(const PrimInfoMBT& other) + { + CentGeom<BBox>::merge(other); + time_range.extend(other.time_range); + object_range._begin += other.object_range.begin(); + object_range._end += other.object_range.end(); + num_time_segments += other.num_time_segments; + if (max_num_time_segments < other.max_num_time_segments) { + max_num_time_segments = other.max_num_time_segments; + max_time_range = other.max_time_range; + } + } + + static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { + PrimInfoMBT r = a; r.merge(b); return r; + } + + __forceinline size_t begin() const { + return object_range.begin(); + } + + __forceinline size_t end() const { + return object_range.end(); + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return object_range.size(); + } + + __forceinline float halfArea() const { + return time_range.size()*expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); + } + + __forceinline float leafSAH(size_t block_shift) const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift); + } + + __forceinline float align_time(float ct) const + { + //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments); + float t0 = (ct-max_time_range.lower)/max_time_range.size(); + float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); + return t1*max_time_range.size()+max_time_range.lower; + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) + { + return cout << "PrimInfo { " << + "object_range = " << pinfo.object_range << + ", time_range = " << pinfo.time_range << + ", time_segments = " << pinfo.num_time_segments << + ", geomBounds = " << pinfo.geomBounds << + ", centBounds = " << pinfo.centBounds << + "}"; + } + + public: + range<size_t> object_range; //!< primitive range + size_t num_time_segments; //!< total number of time segments of all added primrefs + size_t max_num_time_segments; //!< maximum number of time segments of a primitive + BBox1f max_time_range; //!< time range of primitive with max_num_time_segments + BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB + }; + + typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB; + + struct SetMB : public PrimInfoMB + { + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + typedef mvector<PrimRefMB>* PrimRefVector; + + __forceinline SetMB() {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims) + : PrimInfoMB(pinfo_i), prims(prims) {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + object_range = object_range_in; + time_range = intersect(time_range,time_range_in); + } + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + time_range = intersect(time_range,time_range_in); + } + + void deterministic_order() const + { + /* required as parallel partition destroys original primitive order */ + PrimRefMB* prim = prims->data(); + std::sort(&prim[object_range.begin()],&prim[object_range.end()]); + } + + template<typename RecalculatePrimRef> + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const + { + auto reduce = [&](const range<size_t>& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template<typename RecalculatePrimRef> + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto reduce = [&](const range<size_t>& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template<typename RecalculatePrimRef> + const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space); + pinfo.add_primref(ref1); + }; + return pinfo; + }; + + const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, + PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2); + + return SetMB(pinfo,prims,object_range,time_range); + } + + public: + PrimRefVector prims; + }; +//} +} diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp new file mode 100644 index 0000000000..d279dc4993 --- /dev/null +++ b/thirdparty/embree/kernels/builders/primrefgen.cpp @@ -0,0 +1,312 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primrefgen.h" +#include "primrefgen_presplit.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState<PrimInfo> pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, const size_t numPrimRefs, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1) + { + ParallelForForPrefixSumState<PrimInfoMB> pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + } + + /* the BVH starts with that time range, even though primitives might have smaller/larger time range */ + pinfo.time_range = t0t1; + return pinfo; + } + + template<typename Mesh> + size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor) + { + size_t numPrimitives = morton.size(); + + /* compute scene bounds */ + std::pair<size_t,BBox3fa> cb_empty(0,empty); + auto cb = parallel_reduce + ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa> + { + size_t num = 0; + BBox3fa bounds = empty; + + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa prim_bounds = empty; + if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue; + bounds.extend(center2(prim_bounds)); + num++; + } + return std::make_pair(num,bounds); + }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) { + return std::make_pair(a.first + b.first,merge(a.second,b.second)); + }); + + + size_t numPrimitivesGen = cb.first; + const BBox3fa centBounds = cb.second; + + /* compute morton codes */ + if (likely(numPrimitivesGen == numPrimitives)) + { + /* fast path if all primitives were valid */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); j<r.end(); j++) + generator(mesh->bounds(j),unsigned(j)); + }); + } + else + { + /* slow path, fallback in case some primitives were invalid */ + ParallelPrefixSumState<size_t> pstate; + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (unlikely(!mesh->buildBounds(j,&bounds))) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus<size_t>()); + + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(j,&bounds)) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus<size_t>()); + } + return numPrimitivesGen; + } + + // ==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + // special variants for grid meshes + +// -- GODOT start -- +#if defined(EMBREE_GEOMETRY_GRID) +// -- GODOT end -- + PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) + { + PrimInfo pinfo(empty); + size_t numPrimitives = 0; + + /* first run to get #primitives */ + + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,false> iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j); + if (!mesh->valid(j)) continue; + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + assert(pinfo.size() == numPrimitives); + return pinfo; + } + + PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) + { + unsigned int geomID_ = std::numeric_limits<unsigned int>::max (); + + PrimInfo pinfo(empty); + size_t numPrimitives = 0; + + ParallelPrefixSumState<PrimInfo> pstate; + /* iterate over all grids in a single mesh */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,geomID_,unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo + { + + size_t p_index = base.size(); + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,geomID_,unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + return pinfo; + } +// -- GODOT start -- +#endif +// -- GODOT end -- + + // ==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + } +} diff --git a/thirdparty/embree/kernels/builders/primrefgen.h b/thirdparty/embree/kernels/builders/primrefgen.h new file mode 100644 index 0000000000..c09a848ba3 --- /dev/null +++ b/thirdparty/embree/kernels/builders/primrefgen.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" +#include "priminfo.h" +#include "bvh_builder_morton.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0); + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, size_t numPrimitives, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)); + + template<typename Mesh> + size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor); + + /* special variants for grids */ + PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids); + + PrimInfo createPrimRefArrayGrids(GridMesh* mesh, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids); + + } +} + diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h new file mode 100644 index 0000000000..8cd251ddd2 --- /dev/null +++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h @@ -0,0 +1,371 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../builders/primrefgen.h" +#include "../builders/heuristic_spatial.h" +#include "../builders/splitter.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define DBG_PRESPLIT(x) +#define CHECK_PRESPLIT(x) + +#define GRID_SIZE 1024 +#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 +#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG) +#define PRIORITY_CUTOFF_THRESHOLD 1.0f +#define PRIORITY_SPLIT_POS_WEIGHT 1.5f + +namespace embree +{ + namespace isa + { + + struct PresplitItem + { + union { + float priority; + unsigned int data; + }; + unsigned int index; + + __forceinline operator unsigned() const + { + return reinterpret_cast<const unsigned&>(priority); + } + __forceinline bool operator < (const PresplitItem& item) const + { + return (priority < item.priority); + } + + template<typename Mesh> + __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) + { + const unsigned int geomID = ref.geomID(); + const unsigned int primID = ref.primID(); + const float area_aabb = area(ref.bounds()); + const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); + const unsigned int diff = 31 - lzcnt(mc.x^mc.y); + assert(area_prim <= area_aabb); + //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); + const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); + assert(priority >= 0.0f && priority < FLT_LARGE); + return priority; + } + + + }; + + inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { + return cout << "index " << item.index << " priority " << item.priority; + }; + + template<typename SplitterFactory> + void splitPrimitive(SplitterFactory &Splitter, + const PrimRef &prim, + const unsigned int geomID, + const unsigned int primID, + const unsigned int split_level, + const Vec3fa &grid_base, + const float grid_scale, + const float grid_extend, + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], + unsigned int& numSubPrims) + { + assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + if (split_level == 0) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + } + else + { + const Vec3fa lower = prim.lower; + const Vec3fa upper = prim.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + + /* if all bits are equal then we cannot split */ + if(unlikely(lower_code == upper_code)) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + return; + } + + /* compute octree level and dimension to perform the split in */ + const unsigned int diff = 31 - lzcnt(lower_code^upper_code); + const unsigned int level = diff / 3; + const unsigned int dim = diff % 3; + + /* now we compute the grid position of the split */ + const unsigned int isplit = iupper[dim] & ~((1<<level)-1); + + /* compute world space position of split */ + const float inv_grid_size = 1.0f / GRID_SIZE; + const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend; + + assert(prim.lower[dim] <= fsplit && + prim.upper[dim] >= fsplit); + + /* split primitive */ + const auto splitter = Splitter(prim); + BBox3fa left,right; + splitter(prim.bounds(),dim,fsplit,left,right); + assert(!left.empty()); + assert(!right.empty()); + + + splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + } + } + + + template<typename Mesh, typename SplitterFactory> + PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState<PrimInfo> pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) + { + const Vec3fa lower = ref.lower; + const Vec3fa upper = ref.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + return Vec2i(lower_code,upper_code); + } + + template<typename Mesh, typename SplitterFactory> + PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + static const size_t MIN_STEP_SIZE = 128; + + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + + /* use correct number of primitives */ + size_t numPrimitives = pinfo.size(); + const size_t alloc_numPrimitives = prims.size(); + const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; + + /* set up primitive splitter */ + SplitterFactory Splitter(scene); + + + DBG_PRESPLIT( + const size_t org_numPrimitives = pinfo.size(); + PRINT(numPrimitives); + PRINT(alloc_numPrimitives); + PRINT(numSplitPrimitivesBudget); + ); + + /* allocate double buffer presplit items */ + const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; + PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + + /* compute grid */ + const Vec3fa grid_base = pinfo.geomBounds.lower; + const Vec3fa grid_diag = pinfo.geomBounds.size(); + const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); + const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; + + /* init presplit items and get total sum */ + const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float { + float sum = 0.0f; + for (size_t i=r.begin(); i<r.end(); i++) + { + presplitItem[i].index = (unsigned int)i; + const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]); + /* if all bits are equal then we cannot split */ + presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f; + /* FIXME: sum undeterministic */ + sum += presplitItem[i].priority; + } + return sum; + },[](const float& a, const float& b) -> float { return a+b; }); + + /* compute number of splits per primitive */ + const float inv_psum = 1.0f / psum; + parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { + for (size_t i=r.begin(); i<r.end(); i++) + { + if (presplitItem[i].priority > 0.0f) + { + const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; + if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims + { + presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); + //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + } + else + presplitItem[i].priority = 0.0f; + } + } + }); + + auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; + size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); + + /* anything to split ? */ + if (center < numPrimitives) + { + const size_t numPrimitivesToSplit = numPrimitives - center; + assert(presplitItem[center].priority >= 1.0f); + + /* sort presplit items in ascending order */ + radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); + + CHECK_PRESPLIT( + parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { + for (size_t i=r.begin(); i<r.end(); i++) + assert(presplitItem[i-1].priority <= presplitItem[i].priority); + }); + ); + + unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem; + unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; + + /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ + const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { + size_t sum = 0; + for (size_t i=t.begin(); i<t.end(); i++) + { + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; + assert(presplitItem[i].priority >= 1.0f); + const unsigned int primrefID = presplitItem[i].index; + const float prio = presplitItem[i].priority; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = (unsigned int)prio; + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + assert(numSubPrims); + numSubPrims--; // can reuse slot + sum+=numSubPrims; + presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; + primOffset0[i-center] = numSubPrims; + } + return sum; + },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); + + /* if we are over budget, need to shrink the range */ + if (totalNumSubPrims > numSplitPrimitivesBudget) + { + size_t new_center = numPrimitives-1; + size_t sum = 0; + for (;new_center>=center;new_center--) + { + const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; + if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; + sum += numSubPrims; + } + new_center++; + center = new_center; + } + + /* parallel prefix sum to compute offsets for storing sub-primitives */ + const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); + + /* iterate over range, and split primitives into sub primitives and append them to prims array */ + parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { + for (size_t j=rn.begin(); j<rn.end(); j++) + { + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; + const unsigned int primrefID = presplitItem[j].index; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1); + + assert(split_levels); + assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + const size_t newID = numPrimitives + primOffset1[j-center]; + assert(newID+numSubPrims <= alloc_numPrimitives); + prims[primrefID] = subPrims[0]; + for (size_t i=1;i<numSubPrims;i++) + prims[newID+i-1] = subPrims[i]; + } + }); + + numPrimitives += offset; + DBG_PRESPLIT( + PRINT(pinfo.size()); + PRINT(numPrimitives); + PRINT((float)numPrimitives/org_numPrimitives)); + } + + /* recompute centroid bounding boxes */ + pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo { + PrimInfo p(empty); + for (size_t j=r.begin(); j<r.end(); j++) + p.add_center2(prims[j]); + return p; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + + /* free double buffer presplit items */ + alignedFree(tmp_presplitItem); + alignedFree(presplitItem); + return pinfo; + } + } +} diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h new file mode 100644 index 0000000000..f7720bd284 --- /dev/null +++ b/thirdparty/embree/kernels/builders/splitter.h @@ -0,0 +1,191 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" + +namespace embree +{ + namespace isa + { + template<size_t N> + __forceinline void splitPolygon(const BBox3fa& bounds, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + const Vec3fa (&inv_length)[N], + BBox3fa& left_o, + BBox3fa& right_o) + { + BBox3fa left = empty, right = empty; + /* clip triangle to left and right box by processing all edges */ + for (size_t i=0; i<N; i++) + { + const Vec3fa &v0 = v[i]; + const Vec3fa &v1 = v[i+1]; + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + if (v0d <= pos) left. extend(v0); // this point is on left side + if (v0d >= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + left_o = intersect(left,bounds); + right_o = intersect(right,bounds); + } + + template<size_t N> + __forceinline void splitPolygon(const PrimRef& prim, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + PrimRef& left_o, + PrimRef& right_o) + { + BBox3fa left = empty, right = empty; + for (size_t i=0; i<N; i++) + { + const Vec3fa &v0 = v[i]; + const Vec3fa &v1 = v[i+1]; + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + if (v0d <= pos) left. extend(v0); // this point is on left side + if (v0d >= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const float inv_length = 1.0f/(v1d-v0d); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID()); + new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID()); + } + + struct TriangleSplitter + { + __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask ); + TriangleMesh::Triangle tri = mesh->triangle(prim.primID()); + v[0] = mesh->vertex(tri.v[0]); + v[1] = mesh->vertex(tri.v[1]); + v[2] = mesh->vertex(tri.v[2]); + v[3] = mesh->vertex(tri.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<3>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[4]; + Vec3fa inv_length[3]; + }; + + struct TriangleSplitterFactory + { + __forceinline TriangleSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline TriangleSplitter operator() (const PrimRef& prim) const { + return TriangleSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + struct QuadSplitter + { + __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask ); + QuadMesh::Quad quad = mesh->quad(prim.primID()); + v[0] = mesh->vertex(quad.v[0]); + v[1] = mesh->vertex(quad.v[1]); + v[2] = mesh->vertex(quad.v[2]); + v[3] = mesh->vertex(quad.v[3]); + v[4] = mesh->vertex(quad.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); + inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<4>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[5]; + Vec3fa inv_length[4]; + }; + + struct QuadSplitterFactory + { + __forceinline QuadSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline QuadSplitter operator() (const PrimRef& prim) const { + return QuadSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + + struct DummySplitter + { + __forceinline DummySplitter(const Scene* scene, const PrimRef& prim) + { + } + }; + + struct DummySplitterFactory + { + __forceinline DummySplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline DummySplitter operator() (const PrimRef& prim) const { + return DummySplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + } +} + diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp new file mode 100644 index 0000000000..a84295f0da --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh.cpp @@ -0,0 +1,190 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" + +namespace embree +{ + template<int N> + BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene) + : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN), + primTy(&primTy), device(scene->device), scene(scene), + root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0) + { + } + + template<int N> + BVHN<N>::~BVHN () + { + for (size_t i=0; i<objects.size(); i++) + delete objects[i]; + } + + template<int N> + void BVHN<N>::clear() + { + set(BVHN::emptyNode,empty,0); + alloc.clear(); + } + + template<int N> + void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives) + { + this->root = root; + this->bounds = bounds; + this->numPrimitives = numPrimitives; + } + + template<int N> + void BVHN<N>::clearBarrier(NodeRef& node) + { + if (node.isBarrier()) + node.clearBarrier(); + else if (!node.isLeaf()) { + BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH + for (size_t c=0; c<N; c++) + clearBarrier(n->child(c)); + } + } + + template<int N> + void BVHN<N>::layoutLargeNodes(size_t num) + { +#if defined(__64BIT__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues + struct NodeArea + { + __forceinline NodeArea() {} + + __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds) + : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {} + + __forceinline bool operator< (const NodeArea& other) const { + return this->A < other.A; + } + + NodeRef* node; + float A; + }; + std::vector<NodeArea> lst; + lst.reserve(num); + lst.push_back(NodeArea(root,empty)); + + while (lst.size() < num) + { + std::pop_heap(lst.begin(), lst.end()); + NodeArea n = lst.back(); lst.pop_back(); + if (!n.node->isAABBNode()) break; + AABBNode* node = n.node->getAABBNode(); + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVHN::emptyNode) continue; + lst.push_back(NodeArea(node->child(i),node->bounds(i))); + std::push_heap(lst.begin(), lst.end()); + } + } + + for (size_t i=0; i<lst.size(); i++) + lst[i].node->setBarrier(); + + root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator()); +#endif + } + + template<int N> + typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator) + { + if (node.isBarrier()) { + node.clearBarrier(); + return node; + } + else if (node.isAABBNode()) + { + AABBNode* oldnode = node.getAABBNode(); + AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment); + *newnode = *oldnode; + for (size_t c=0; c<N; c++) + newnode->child(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator); + return encodeNode(newnode); + } + else return node; + } + + template<int N> + double BVHN<N>::preBuild(const std::string& builderName) + { + if (builderName == "") + return inf; + + if (device->verbosity(2)) + { + Lock<MutexSys> lock(g_printMutex); + std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush; + } + + double t0 = 0.0; + if (device->benchmark || device->verbosity(2)) t0 = getSeconds(); + return t0; + } + + template<int N> + void BVHN<N>::postBuild(double t0) + { + if (t0 == double(inf)) + return; + + double dt = 0.0; + if (device->benchmark || device->verbosity(2)) + dt = getSeconds()-t0; + + std::unique_ptr<BVHNStatistics<N>> stat; + + /* print statistics */ + if (device->verbosity(2)) + { + if (!stat) stat.reset(new BVHNStatistics<N>(this)); + const size_t usedBytes = alloc.getUsedBytes(); + Lock<MutexSys> lock(g_printMutex); + std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl; + + if (device->verbosity(2)) + std::cout << stat->str(); + + if (device->verbosity(2)) + { + FastAllocator::AllStatistics stat(&alloc); + for (size_t i=0; i<objects.size(); i++) + if (objects[i]) + stat = stat + FastAllocator::AllStatistics(&objects[i]->alloc); + + stat.print(numPrimitives); + } + + if (device->verbosity(3)) + { + alloc.print_blocks(); + for (size_t i=0; i<objects.size(); i++) + if (objects[i]) + objects[i]->alloc.print_blocks(); + } + + std::cout << std::flush; + } + + /* benchmark mode */ + if (device->benchmark) + { + if (!stat) stat.reset(new BVHNStatistics<N>(this)); + Lock<MutexSys> lock(g_printMutex); + std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush; + } + } + +#if defined(__AVX__) + template class BVHN<8>; +#endif + +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) + template class BVHN<4>; +#endif +} + diff --git a/thirdparty/embree/kernels/bvh/bvh.h b/thirdparty/embree/kernels/bvh/bvh.h new file mode 100644 index 0000000000..565eec5a58 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh.h @@ -0,0 +1,235 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* include all node types */ +#include "bvh_node_aabb.h" +#include "bvh_node_aabb_mb.h" +#include "bvh_node_aabb_mb4d.h" +#include "bvh_node_obb.h" +#include "bvh_node_obb_mb.h" +#include "bvh_node_qaabb.h" + +namespace embree +{ + /*! flags used to enable specific node types in intersectors */ + enum BVHNodeFlags + { + BVH_FLAG_ALIGNED_NODE = 0x00001, + BVH_FLAG_ALIGNED_NODE_MB = 0x00010, + BVH_FLAG_UNALIGNED_NODE = 0x00100, + BVH_FLAG_UNALIGNED_NODE_MB = 0x01000, + BVH_FLAG_QUANTIZED_NODE = 0x100000, + BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000, + + /* short versions */ + BVH_AN1 = BVH_FLAG_ALIGNED_NODE, + BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB, + BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_UN1 = BVH_FLAG_UNALIGNED_NODE, + BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB, + BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE, + BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_QN1 = BVH_FLAG_QUANTIZED_NODE + }; + + /*! Multi BVH with N children. Each node stores the bounding box of + * it's N children as well as N child references. */ + template<int N> + class BVHN : public AccelData + { + ALIGNED_CLASS_(16); + public: + + /*! forward declaration of node ref type */ + typedef NodeRefPtr<N> NodeRef; + typedef BaseNode_t<NodeRef,N> BaseNode; + typedef AABBNode_t<NodeRef,N> AABBNode; + typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB; + typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D; + typedef OBBNode_t<NodeRef,N> OBBNode; + typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB; + typedef QuantizedBaseNode_t<N> QuantizedBaseNode; + typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB; + typedef QuantizedNode_t<NodeRef,N> QuantizedNode; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! Empty node */ + static const size_t emptyNode = NodeRef::emptyNode; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = NodeRef::invalidNode; + static const size_t popRay = NodeRef::popRay; + + /*! Maximum depth of the BVH. */ + static const size_t maxBuildDepth = 32; + static const size_t maxBuildDepthLeaf = maxBuildDepth+8; + static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks; + + public: + + /*! Builder interface to create allocator */ + struct CreateAlloc : public FastAllocator::Create { + __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} + }; + + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + public: + + /*! BVHN default constructor. */ + BVHN (const PrimitiveType& primTy, Scene* scene); + + /*! BVHN destruction */ + ~BVHN (); + + /*! clears the acceleration structure */ + void clear(); + + /*! sets BVH members after build */ + void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives); + + /*! Clears the barrier bits of a subtree. */ + void clearBarrier(NodeRef& node); + + /*! lays out num large nodes of the BVH */ + void layoutLargeNodes(size_t num); + NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator); + + /*! called by all builders before build starts */ + double preBuild(const std::string& builderName); + + /*! called by all builders after build ended */ + void postBuild(double t0); + + /*! allocator class */ + struct Allocator { + BVHN* bvh; + Allocator (BVHN* bvh) : bvh(bvh) {} + __forceinline void* operator() (size_t bytes) const { + return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); + } + }; + + /*! post build cleanup */ + void cleanup() { + alloc.cleanup(); + } + + public: + + /*! Encodes a node */ + static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); } + static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); } + + public: + + /*! Prefetches the node this reference points to */ + __forceinline static void prefetch(const NodeRef ref, int types=0) + { +#if defined(__AVX512PF__) // MIC + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL2(((char*)ref.ptr)+2*64); + prefetchL2(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* KNL still needs L2 prefetches for large nodes */ + prefetchL2(((char*)ref.ptr)+4*64); + prefetchL2(((char*)ref.ptr)+5*64); + prefetchL2(((char*)ref.ptr)+6*64); + prefetchL2(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + prefetchL2(((char*)ref.ptr)+2*64); + } +#else + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL1(((char*)ref.ptr)+2*64); + prefetchL1(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* deactivate for large nodes on Xeon, as it introduces regressions */ + //prefetchL1(((char*)ref.ptr)+4*64); + //prefetchL1(((char*)ref.ptr)+5*64); + //prefetchL1(((char*)ref.ptr)+6*64); + //prefetchL1(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + prefetchL1(((char*)ref.ptr)+2*64); + } +#endif + } + + __forceinline static void prefetchW(const NodeRef ref, int types=0) + { + embree::prefetchEX(((char*)ref.ptr)+0*64); + embree::prefetchEX(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+2*64); + embree::prefetchEX(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+4*64); + embree::prefetchEX(((char*)ref.ptr)+5*64); + embree::prefetchEX(((char*)ref.ptr)+6*64); + embree::prefetchEX(((char*)ref.ptr)+7*64); + } + } + + /*! bvh type information */ + public: + const PrimitiveType* primTy; //!< primitive type stored in the BVH + + /*! bvh data */ + public: + Device* device; //!< device pointer + Scene* scene; //!< scene pointer + NodeRef root; //!< root node + FastAllocator alloc; //!< allocator used to allocate nodes + + /*! statistics data */ + public: + size_t numPrimitives; //!< number of primitives the BVH is build over + size_t numVertices; //!< number of vertices the BVH references + + /*! data arrays for special builders */ + public: + std::vector<BVHN*> objects; + vector_t<char,aligned_allocator<char,32>> subdiv_patches; + }; + + typedef BVHN<4> BVH4; + typedef BVHN<8> BVH8; +} diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp new file mode 100644 index 0000000000..890d5e7b7c --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh4_factory.cpp @@ -0,0 +1,1325 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh4_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + + BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void BVH4Factory::selectBuilders(int features) + { + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX(features,BVH4BuilderTwoLevelInstanceSAH)); + + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4SubdivPatch1BuilderSAH)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4SubdivPatch1MBBuilderSAH)); + } + + void BVH4Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,VirtualCurveIntersector4iMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8iMB)); + + /* select intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,BVH4Triangle4iIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector1Moeller)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,QBVH4Triangle4iIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,QBVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector1)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector1)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector4)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector4)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector4Chunk)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector4HybridMoeller)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridMBIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersector8HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1Intersector8)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4SubdivPatch1MBIntersector8)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridMBIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH4GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH4Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512(features,BVH4SubdivPatch1Intersector16)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512(features,BVH4SubdivPatch1MBIntersector16)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH4VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH4VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH4InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridMBIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH4GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4IntersectorStreamMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,BVH4InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::ROBUST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH4VirtualIntersectorStream(); +#endif + intersectors.collider = BVH4ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH4InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1Intersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1Intersector4(); + intersectors.intersector8 = BVH4SubdivPatch1Intersector8(); + intersectors.intersector16 = BVH4SubdivPatch1Intersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1MBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1MBIntersector4(); + intersectors.intersector8 = BVH4SubdivPatch1MBIntersector8(); + intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8i>"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4v::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4v::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4i>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default" ) { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4iMB>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4vMB::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4vMB>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4vMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "sah" ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->quad_builder == "dynamic" ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4<Quad4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel); + Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel); + Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH4* accel = new BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene) + { + BVH4* accel = new BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel); + Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + // Builder* builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH4GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH4GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4<GridMesh>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4MB<GridMesh>"); + return new AccelInstance(accel,builder,intersectors); + } + +} diff --git a/thirdparty/embree/kernels/bvh/bvh4_factory.h b/thirdparty/embree/kernels/bvh/bvh4_factory.h new file mode 100644 index 0000000000..30973971a4 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh4_factory.h @@ -0,0 +1,316 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! BVH4 instantiations */ + class BVH4Factory : public BVHFactory + { + public: + BVH4Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH4Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST); + Accel* BVH4Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4QuantizedTriangle4i(Scene* scene); + Accel* BVH4QuantizedQuad4i(Scene* scene); + + Accel* BVH4SubdivPatch1(Scene* scene); + Accel* BVH4SubdivPatch1MB(Scene* scene); + + Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4UserGeometryMB(Scene* scene); + + Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh); + Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh); + Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh); + Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh); + Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + private: + + DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // spatial scene builder + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp new file mode 100644 index 0000000000..d4521af241 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh8_factory.cpp @@ -0,0 +1,1165 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8 + +#if defined (EMBREE_TARGET_SIMD8) + +#include "bvh8_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + BVH8Factory::BVH8Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void BVH8Factory::selectBuilders(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4SceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4iMBSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX(features,BVH8BuilderTwoLevelInstanceSAH)); + } + + void BVH8Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,VirtualCurveIntersector8iMB)); + + /* select intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector1Woop)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iMBIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iMBIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Triangle4iIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Triangle4Intersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,QBVH8Quad4iIntersector1Pluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector4Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersector8HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512(features,BVH8Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH8VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512(features,BVH8VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512(features,BVH8InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512(features,BVH8GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + + SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4IntersectorStreamMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,BVH8InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8Hybrid(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH8Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH8Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH8Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH8Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH8Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH8Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH8Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; +#define ENABLE_WOOP_TEST 0 +#if ENABLE_WOOP_TEST == 0 + //assert(ivariant == IntersectVariant::ROBUST); + intersectors.intersector1 = BVH8Triangle4vIntersector1Pluecker(); +#else + intersectors.intersector1 = BVH8Triangle4vIntersector1Woop(); +#endif + +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH8Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH8Quad4vIntersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH8Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH8Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH8Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH8Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH8Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH8VirtualIntersectorStream(); +#endif + intersectors.collider = BVH8ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH8InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8v::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant); + Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant); + Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4Intersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4v::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { // FIXME: implement + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4vMB::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4vMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4iIntersectors(accel); + Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel); + Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "dynamic" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else if (scene->device->quad_builder == "morton" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel); + Builder* builder = nullptr; + if (scene->device->quad_builder == "default" ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8<Quad4i>"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel); + Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);; break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH8GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH8GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = nullptr; + intersectors.intersector8 = nullptr; + intersectors.intersector16 = nullptr; + intersectors.intersectorN = nullptr; +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder == "default") { + builder = BVH8GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<GridMesh>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder_mb == "default") { + builder = BVH8GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB<GridMesh>"); + return new AccelInstance(accel,builder,intersectors); + } +} + +#endif diff --git a/thirdparty/embree/kernels/bvh/bvh8_factory.h b/thirdparty/embree/kernels/bvh/bvh8_factory.h new file mode 100644 index 0000000000..198d6f1df0 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh8_factory.h @@ -0,0 +1,280 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! BVH8 instantiations */ + class BVH8Factory : public BVHFactory + { + public: + BVH8Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant); + Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH8Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8QuantizedTriangle4i(Scene* scene); + Accel* BVH8QuantizedTriangle4(Scene* scene); + Accel* BVH8QuantizedQuad4i(Scene* scene); + + Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8UserGeometryMB(Scene* scene); + + Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh); + Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh); + Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh); + Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh); + Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + private: + DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // SAH spatial scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder.cpp b/thirdparty/embree/kernels/bvh/bvh_builder.cpp new file mode 100644 index 0000000000..161d01bb5c --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder.cpp @@ -0,0 +1,60 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder.h" + +namespace embree +{ + namespace isa + { + template<int N> + typename BVHN<N>::NodeRef BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRef> + (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + + template<int N> + typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRef> + (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template<int N> + typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRecordMB { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRecordMB> + (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template struct BVHNBuilderVirtual<4>; + template struct BVHNBuilderQuantizedVirtual<4>; + template struct BVHNBuilderMblurVirtual<4>; + +#if defined(__AVX__) + template struct BVHNBuilderVirtual<8>; + template struct BVHNBuilderQuantizedVirtual<8>; + template struct BVHNBuilderMblurVirtual<8>; +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder.h b/thirdparty/embree/kernels/bvh/bvh_builder.h new file mode 100644 index 0000000000..e35d052a62 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder.h @@ -0,0 +1,115 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "../builders/bvh_builder_sah.h" +#include "../builders/bvh_builder_msmblur.h" + +namespace embree +{ + namespace isa + { + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N> + struct BVHNBuilderVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template<int N> + struct BVHNBuilderQuantizedVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template<int N> + struct BVHNBuilderMblurVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange); + virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange); + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp new file mode 100644 index 0000000000..4a4d8d71df --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_morton.cpp @@ -0,0 +1,531 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" +#include "bvh_rotate.h" +#include "../common/profile.h" +#include "../../common/algorithms/parallel_prefix_sum.h" + +#include "../builders/primrefgen.h" +#include "../builders/bvh_builder_morton.h" + +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +#if defined(__64BIT__) +# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform +#else +# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues +#endif + +namespace embree +{ + namespace isa + { + template<int N> + struct SetBVHNBounds + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + typedef typename BVH::AABBNode AABBNode; + + BVH* bvh; + __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num) + { + AABBNode* node = ref.getAABBNode(); + + BBox3fa res = empty; + for (size_t i=0; i<num; i++) { + const BBox3fa b = children[i].bounds; + res.extend(b); + node->setRef(i,children[i].ref); + node->setBounds(i,b); + } + + BBox3fx result = (BBox3fx&)res; +#if ROTATE_TREE + if (N == 4) + { + size_t n = 0; + for (size_t i=0; i<num; i++) + n += children[i].bounds.lower.a; + + if (n >= 4096) { + for (size_t i=0; i<num; i++) { + if (children[i].bounds.lower.a < 4096) { + for (int j=0; j<ROTATE_TREE; j++) + BVHNRotate<N>::rotate(node->child(i)); + node->child(i).setBarrier(); + } + } + } + result.lower.a = unsigned(n); + } +#endif + + return NodeRecord(ref,result); + } + }; + + template<int N, typename Primitive> + struct CreateMortonLeaf; + + template<int N> + struct CreateMortonLeaf<N,Triangle4> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + + Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = unsigned(current.size()); +#endif + return NodeRecord(ref,box_o); + } + + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Triangle4v> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Triangle4i> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 v0 = zero, v1 = zero, v2 = zero; + vuint4 vgeomID = -1, vprimID = -1; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID[i] = geomID_; + vprimID[i] = primID; + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; + } + + for (size_t i=items; i<4; i++) + { + vgeomID[i] = vgeomID[0]; + vprimID[i] = -1; + v0[i] = 0; + v1[i] = 0; + v2[i] = 0; + } + Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Quad4v> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero; + const QuadMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const QuadMesh::Quad& tri = mesh->quad(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + const Vec3fa& p3 = mesh->vertex(tri.v[3]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + QuadMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Object> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + + /* allocate leaf node */ + Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const UserGeometry* mesh = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + { + const unsigned int index = morton[start+i].index; + const unsigned int primID = index; + bounds.extend(mesh->bounds(primID)); + new (&accel[i]) Object(geomID_,primID); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + UserGeometry* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,InstancePrimitive> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items <= 1); + + /* allocate leaf node */ + InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const Instance* instance = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + bounds.extend(instance->bounds(primID)); + new (&accel[i]) InstancePrimitive(instance, geomID_); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + Instance* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<typename Mesh> + struct CalculateMeshBounds + { + __forceinline CalculateMeshBounds (Mesh* mesh) + : mesh(mesh) {} + + __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) { + return mesh->bounds(morton.index); + } + + private: + Mesh* mesh; + }; + + template<int N, typename Mesh, typename Primitive> + class BVHNMeshBuilderMorton : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + public: + + BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD) + : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} + + /* build function */ + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + morton.clear(); + } + size_t numPrimitives = mesh->size(); + numPreviousPrimitives = numPrimitives; + + /* skip build for empty scene */ + if (numPrimitives == 0) { + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* preallocate arrays */ + morton.resize(numPrimitives); + size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim); + bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes + bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated); + + /* create morton code array */ + BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); + size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface); + + /* create BVH */ + SetBVHNBounds<N> setBounds(bvh); + CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data()); + CalculateMeshBounds<Mesh> calculateBounds(mesh); + auto root = BVHBuilderMorton::build<NodeRecord>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create(), + setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface, + morton.data(),dest,numPrimitivesGen,settings); + + bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); + +#if ROTATE_TREE + if (N == 4) + { + for (int i=0; i<ROTATE_TREE; i++) + BVHNRotate<N>::rotate(bvh->root); + bvh->clearBarrier(bvh->root); + } +#endif + + /* clear temporary data for static geometry */ + if (bvh->scene->isStaticAccel()) { + morton.clear(); + } + bvh->cleanup(); + } + + void clear() { + morton.clear(); + } + + private: + BVH* bvh; + Mesh* mesh; + mvector<BVHBuilderMorton::BuildPrim> morton; + BVHBuilderMorton::Settings settings; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + }; + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp new file mode 100644 index 0000000000..fad02fcc04 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah.cpp @@ -0,0 +1,543 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define PROFILE 0 +#define PROFILE_RUNS 20 + +namespace embree +{ + namespace isa + { + template<int N, typename Primitive> + struct CreateLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + + template<int N, typename Primitive> + struct CreateLeafQuantized + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N, typename Primitive> + struct BVHNBuilderSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector<PrimRef> prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max (); + bool primrefarrayalloc; + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, + const Geometry::GTypeMask gtype, bool primrefarrayalloc = false) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), + settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {} + + BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + /* create primref array */ + if (primrefarrayalloc) { + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + } + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + prims.resize(numPrimitives); + + PrimInfo pinfo = mesh ? + createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,numPrimitives,prims,bvh->scene->progressInterface); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + +#if PROFILE + }); +#endif + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N, typename Primitive> + struct BVHNBuilderSAHQuantized : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector<PrimRef> prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {} + + BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + prims.clear(); + bvh->clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + /* create primref array */ + prims.resize(numPrimitives); + PrimInfo pinfo = mesh ? + createPrimRefArray(mesh,geomID_,numPrimitives,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,numPrimitives,prims,bvh->scene->progressInterface); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* call BVH builder */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!! +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + + template<int N, typename Primitive> + struct CreateLeafGrid + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); //Primitive::blocks(n); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridQBVHN<N>* accel = (SubGridQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs); + + for (size_t g=0;g<num_geomIDs;g++) + { + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData& sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + bounds[pos] = prims[start+i].bounds(); + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridQBVHN<N>(x,y,primID,bounds,geomIDs[g],pos); + } + + return node; + } + + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + + + template<int N> + struct BVHNBuilderSAHGrid : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + GridMesh* mesh; + mvector<PrimRef> prims; + mvector<SubGridBuildData> sgrids; + GeneralBVHBuilder::Settings settings; + const unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} + + BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false); + numPreviousPrimitives = numGridPrimitives; + + + PrimInfo pinfo = mesh ? createPrimRefArrayGrids(mesh,prims,sgrids) : createPrimRefArrayGrids(scene,prims,sgrids); + const size_t numPrimitives = pinfo.size(); + /* no primitives */ + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + sgrids.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + + /* create primref array */ + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + sgrids.clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary array */ + sgrids.clear(); + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH4Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + + Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH8Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + Builder* BVH8QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8QuantizedTriangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + + + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH4QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH8QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + + Builder* BVH4VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type); + } +#if defined(__AVX__) + + Builder* BVH8VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype); + } +#if defined(__AVX__) + Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); } + Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct + +#if defined(__AVX__) + Builder* BVH8GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); } + Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct +#endif +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp new file mode 100644 index 0000000000..d163a80ab1 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_mb.cpp @@ -0,0 +1,705 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/bvh_builder_msmblur.h" + +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + + +namespace embree +{ + namespace isa + { + +#if 0 + template<int N, typename Primitive> + struct CreateMBlurLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(set.size()); + size_t start = set.begin(); + for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa allBounds = empty; + for (size_t i=0; i<items; i++) + allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time)); + + return NodeRecordMB(node,allBounds); + } + + BVH* bvh; + PrimRef* prims; + size_t time; + }; +#endif + + template<int N, typename Mesh, typename Primitive> + struct CreateMSMBlurLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(current.prims.size()); + size_t start = current.prims.begin(); + size_t end = current.prims.end(); + for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteNodeAlignment); + NodeRef node = bvh->encodeLeaf((char*)accel,items); + LBBox3fa allBounds = empty; + for (size_t i=0; i<items; i++) + allBounds.extend(accel[i].fillMB(current.prims.prims->data(), start, current.prims.end(), bvh->scene, current.prims.time_range)); + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + BVH* bvh; + }; + + /* Motion blur BVH with 4D nodes and internal time splits */ + template<int N, typename Mesh, typename Primitive> + struct BVHNBuilderMBlurSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + typedef typename BVHN<N>::NodeRecordMB NodeRecordMB; + typedef typename BVHN<N>::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + const Geometry::GTypeMask gtype_; + + BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype_,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + + /*if (numTimeSegments == 1) + buildSingleSegment(numPrimitives); + else*/ + buildMultiSegment(numPrimitives); + +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf. + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRef> prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,numPrimitives,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build<NodeRecordMB> + (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(), + CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRefMB> prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,numPrimitives,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = Primitive::singleTimeSegment; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device, + RecalculatePrimRef<Mesh>(scene), + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh), + bvh->scene->progressInterface, + settings); + + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct GridRecalculatePrimRef + { + Scene* scene; + const SubGridBuildData * const sgrids; + + __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids) + : scene(scene), sgrids(sgrids) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get<GridMesh>(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range); + const unsigned num_time_segments = mesh->numTimeSegments(); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get<GridMesh>(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + return mesh->linearBounds(mesh->grid(primID),x,y,time_range); + } + + }; + + template<int N> + struct CreateMSMBlurLeafGrid + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = current.prims.size(); + const size_t start = current.prims.begin(); + + const PrimRefMB* prims = current.prims.prims->data(); + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;g<num_geomIDs;g++) + { + const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]); + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + const size_t x = sgrid_bd.x(); + const size_t y = sgrid_bd.y(); + LBBox3fa newBounds = mesh->linearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range); + allBounds.extend(newBounds); + bounds0[pos] = newBounds.bounds0; + bounds1[pos] = newBounds.bounds1; + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos); + } + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + +#if 0 + template<int N> + struct CreateLeafGridMB + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) + : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;g<num_geomIDs;g++) + { + const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]); + + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + const size_t x = sgrid_bd.x(); + const size_t y = sgrid_bd.y(); + bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]); + bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]); + assert(valid0); + assert(valid1); + allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos])); + pos++; + } + new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos); + } + return NodeRecordMB(node,allBounds); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; +#endif + + + /* Motion blur BVH with 4D nodes and internal time splits */ + template<int N> + struct BVHNBuilderMBlurSAHGrid : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + typedef typename BVHN<N>::NodeRecordMB NodeRecordMB; + typedef typename BVHN<N>::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + mvector<SubGridBuildData> sgrids; + + + BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {} + + + PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,true> iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j,range<size_t>(0,1))) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,unsigned(geomID),unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + size_t numPrimitives = pinfo.size(); + if (numPrimitives == 0) return pinfo; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + const GridMesh::Grid &g = mesh->grid(j); + if (!mesh->valid(j,range<size_t>(0,1))) continue; + + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState<PrimInfoMB> pstate; + Scene::Iterator<GridMesh,true> iter(scene); + + pstate.init(iter,size_t(1024)); + /* iterate over all meshes in the scene */ + PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t /*geomID*/) -> PrimInfoMB { + + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; + LBBox3fa bounds(empty); + PrimInfoMB gridMB(0,mesh->getNumSubGrids(j)); + pinfoMB.merge(gridMB); + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + size_t numPrimitives = pinfoMB.size(); + if (numPrimitives == 0) return pinfoMB; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + + k = base.size(); + size_t p_index = k; + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; + const GridMesh::Grid &g = mesh->grid(j); + + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + const PrimRefMB prim(mesh->linearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index)); + pinfoMB.add_primref(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + assert(pinfoMB.size() == numPrimitives); + pinfoMB.time_range = t0t1; + return pinfoMB; + } + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid"); + + //const size_t numTimeSteps = scene->getNumTimeSteps<GridMesh,true>(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + //if (numTimeSegments == 1) + // buildSingleSegment(numPrimitives); + //else + buildMultiSegment(numPrimitives); + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRef> prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + //TODO: check leaf_bytes + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build<NodeRecordMB> + (typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB::Create(), + typename BVH::AABBNodeMB::Set(), + CreateLeafGridMB<N>(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRefMB> prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + + + GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data()); + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + //FIXME: check leaf_bytes + //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN<N>)); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = false; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device, + recalculatePrimRef, + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + settings); + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#if defined(__AVX__) + Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#if defined(__AVX__) + Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#if defined(__AVX__) + Builder* BVH8VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } +#if defined(__AVX__) + Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); } +#if defined(__AVX__) + Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); } +#endif +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp new file mode 100644 index 0000000000..a4e55d7484 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_sah_spatial.cpp @@ -0,0 +1,201 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" + +#include "../builders/primrefgen.h" +#include "../builders/primrefgen_presplit.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +namespace embree +{ + namespace isa + { + template<int N, typename Primitive> + struct CreateLeafSpatial + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + template<int N, typename Mesh, typename Primitive, typename Splitter> + struct BVHNBuilderFastSpatialSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + BVH* bvh; + Scene* scene; + Mesh* mesh; + mvector<PrimRef> prims0; + GeneralBVHBuilder::Settings settings; + const float splitFactor; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications) {} + + BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false); + numPreviousPrimitives = numOriginalPrimitives; + if (numOriginalPrimitives == 0) { + prims0.clear(); + bvh->clear(); + return; + } + + const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>(); + const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))); + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH")); + + /* create primref array */ + const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives)); + prims0.resize(numSplitPrimitives); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + NodeRef root(0); + PrimInfo pinfo; + + + if (likely(usePreSplits)) + { + /* spatial presplit SAH BVH builder */ + pinfo = mesh ? + createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) : + createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings); + } + else + { + /* standard spatial split SAH BVH builder */ + pinfo = mesh ? + createPrimRefArray(mesh,geomID_,numSplitPrimitives,prims0,bvh->scene->progressInterface) : + createPrimRefArray(scene,Mesh::geom_type,false,numSplitPrimitives,prims0,bvh->scene->progressInterface); + + Splitter splitter(scene); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + CreateLeafSpatial<N,Primitive>(bvh), + splitter, + bvh->scene->progressInterface, + prims0.data(), + numSplitPrimitives, + pinfo,settings); + + /* ==================== */ + } + + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims0.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims0.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + + Builder* BVH4Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif + +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp new file mode 100644 index 0000000000..5d45ed3748 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.cpp @@ -0,0 +1,369 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder_twolevel.h" +#include "bvh_statistics.h" +#include "../builders/bvh_builder_sah.h" +#include "../common/scene_line_segments.h" +#include "../common/scene_triangle_mesh.h" +#include "../common/scene_quad_mesh.h" + +#define PROFILE 0 + +namespace embree +{ + namespace isa + { + template<int N, typename Mesh, typename Primitive> + BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold) + : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {} + + template<int N, typename Mesh, typename Primitive> + BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () { + } + + // =========================================================================== + // =========================================================================== + // =========================================================================== + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build() + { + /* delete some objects */ + size_t num = scene->size(); + if (num < bvh->objects.size()) { + parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + builders[i].reset(); + delete bvh->objects[i]; bvh->objects[i] = nullptr; + } + }); + } + +#if PROFILE + while(1) +#endif + { + /* reset memory allocator */ + bvh->alloc.reset(); + + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype,false); + + if (numPrimitives == 0) { + prims.resize(0); + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* calculate the size of the entire BVH */ + const size_t numLeafBlocks = Primitive::blocks(numPrimitives); + const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N; + const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel"); + + /* resize object array if scene got larger */ + if (bvh->objects.size() < num) bvh->objects.resize(num); + if (builders.size() < num) builders.resize(num); + resizeRefsList (); + nextRef.store(0); + + /* create acceleration structures */ + parallel_for(size_t(0), num, [&] (const range<size_t>& r) + { + for (size_t objectID=r.begin(); objectID<r.end(); objectID++) + { + Mesh* mesh = scene->getSafe<Mesh>(objectID); + + /* ignore meshes we do not support */ + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + + if (isSmallGeometry(mesh)) { + setupSmallBuildRefBuilder (objectID, mesh); + } else { + setupLargeBuildRefBuilder (objectID, mesh); + } + } + }); + + /* parallel build of acceleration structures */ + parallel_for(size_t(0), num, [&] (const range<size_t>& r) + { + for (size_t objectID=r.begin(); objectID<r.end(); objectID++) + { + /* ignore if no triangle mesh or not enabled */ + Mesh* mesh = scene->getSafe<Mesh>(objectID); + if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) + continue; + + builders[objectID]->attachBuildRefs (this); + } + }); + + +#if PROFILE + double d0 = getSeconds(); +#endif + /* fast path for single geometry scenes */ + if (nextRef == 1) { + bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives); + } + + else + { + /* open all large nodes */ + refs.resize(nextRef); + + /* this probably needs some more tuning */ + const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR)); + +#if !ENABLE_DIRECT_SAH_MERGE_BUILDER + +#if ENABLE_OPEN_SEQUENTIAL + open_sequential(extSize); +#endif + /* compute PrimRefs */ + prims.resize(refs.size()); +#endif + + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + pinfo.add_center2(refs[i]); + } + return pinfo; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); + +#else + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + pinfo.add_center2(refs[i]); + prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node); + } + return pinfo; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); +#endif + + /* skip if all objects where empty */ + if (pinfo.size() == 0) + bvh->set(BVH::emptyNode,empty,0); + + /* otherwise build toplevel hierarchy */ + else + { + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(N); + settings.minLeafSize = 1; + settings.maxLeafSize = 1; + settings.travCost = 1.0f; + settings.intCost = 1.0f; + settings.singleThreadThreshold = singleThreadThreshold; + +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + refs.resize(extSize); + + NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) refs[range.begin()].node; + }, + [&] (BuildRef &bref, BuildRef *refs) -> size_t { + return openBuildRef(bref,refs); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + refs.data(),extSize,pinfo,settings); +#else + NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) prims[range.begin()].ID(); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + prims.data(),pinfo,settings); +#endif + + + bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives); + } + } + } + + bvh->alloc.cleanup(); + bvh->postBuild(t0); +#if PROFILE + double d1 = getSeconds(); + std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl; +#endif + } + + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID) + { + if (geomID >= bvh->objects.size()) return; + if (builders[geomID]) builders[geomID].reset(); + delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr; + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear() + { + for (size_t i=0; i<bvh->objects.size(); i++) + if (bvh->objects[i]) bvh->objects[i]->clear(); + + for (size_t i=0; i<builders.size(); i++) + if (builders[i]) builders[i].reset(); + + refs.clear(); + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize) + { + if (refs.size() == 0) + return; + + refs.reserve(extSize); + +#if 1 + for (size_t i=0;i<refs.size();i++) + { + NodeRef ref = refs[i].node; + if (ref.isAABBNode()) + BVH::prefetch(ref); + } +#endif + + std::make_heap(refs.begin(),refs.end()); + while (refs.size()+N-1 <= extSize) + { + std::pop_heap (refs.begin(),refs.end()); + NodeRef ref = refs.back().node; + if (ref.isLeaf()) break; + refs.pop_back(); + + AABBNode* node = ref.getAABBNode(); + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVH::emptyNode) continue; + refs.push_back(BuildRef(node->bounds(i),node->child(i))); + +#if 1 + NodeRef ref_pre = node->child(i); + if (ref_pre.isAABBNode()) + ref_pre.prefetch(); +#endif + std::push_heap (refs.begin(),refs.end()); + } + } + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/) + { + if (builders[objectID] == nullptr || // new mesh + dynamic_cast<RefBuilderSmall*>(builders[objectID].get()) == nullptr) // size change resulted in large->small change + { + builders[objectID].reset (new RefBuilderSmall(objectID)); + } + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh) + { + if (bvh->objects[objectID] == nullptr || // new mesh + builders[objectID]->meshQualityChanged (mesh->quality) || // changed build quality + dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr) // size change resulted in small->large change + { + Builder* builder = nullptr; + delete bvh->objects[objectID]; + createMeshAccel(objectID, builder); + builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality)); + } + } + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#if defined(__AVX__) +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h new file mode 100644 index 0000000000..dc7ec7d278 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel.h @@ -0,0 +1,263 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <type_traits> + +#include "bvh_builder_twolevel_internal.h" +#include "bvh.h" +#include "../common/primref.h" +#include "../builders/priminfo.h" +#include "../builders/primrefgen.h" + +/* new open/merge builder */ +#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1 +#define ENABLE_OPEN_SEQUENTIAL 0 +#define SPLIT_MEMORY_RESERVE_FACTOR 1000 +#define SPLIT_MEMORY_RESERVE_SCALE 2 +#define SPLIT_MIN_EXT_SPACE 1000 + +namespace embree +{ + namespace isa + { + template<int N, typename Mesh, typename Primitive> + class BVHNBuilderTwoLevel : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + __forceinline static bool isSmallGeometry(Mesh* mesh) { + return mesh->size() <= 4; + } + + public: + + typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + + struct BuildRef : public PrimRef + { + public: + __forceinline BuildRef () {} + + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node) + : PrimRef(bounds,(size_t)node), node(node) + { + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + /* used by the open/merge bvh builder */ + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives) + : PrimRef(bounds,geomID,numPrimitives), node(node) + { + /* important for relative buildref ordering */ + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + __forceinline size_t size() const { + return primID(); + } + + friend bool operator< (const BuildRef& a, const BuildRef& b) { + return a.bounds_area < b.bounds_area; + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }"; + } + + __forceinline unsigned int numPrimitives() const { return primID(); } + + public: + NodeRef node; + float bounds_area; + }; + + + __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) { + if (bref.node.isLeaf()) + { + refs[0] = bref; + return 1; + } + NodeRef ref = bref.node; + unsigned int geomID = bref.geomID(); + unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); + AABBNode* node = ref.getAABBNode(); + size_t n = 0; + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVH::emptyNode) continue; + refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims); + n++; + } + assert(n > 1); + return n; + } + + /*! Constructor. */ + BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD); + + /*! Destructor */ + ~BVHNBuilderTwoLevel (); + + /*! builder entry point */ + void build(); + void deleteGeometry(size_t geomID); + void clear(); + + void open_sequential(const size_t extSize); + + private: + + class RefBuilderBase { + public: + virtual ~RefBuilderBase () {} + virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0; + virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0; + }; + + class RefBuilderSmall : public RefBuilderBase { + public: + + RefBuilderSmall (size_t objectID) + : objectID_ (objectID) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) { + + Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_); + size_t meshSize = mesh->size(); + assert(isSmallGeometry(mesh)); + + mvector<PrimRef> prefs(topBuilder->scene->device, meshSize); + auto pinfo = createPrimRefArray(mesh,objectID_,meshSize,prefs,topBuilder->bvh->scene->progressInterface); + + size_t begin=0; + while (begin < pinfo.size()) + { + Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1); + accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene); + + /* create build primitive */ +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node); +#endif + } + assert(begin == pinfo.size()); + } + + bool meshQualityChanged (RTCBuildQuality /*currQuality*/) { + return false; + } + + size_t objectID_; + }; + + class RefBuilderLarge : public RefBuilderBase { + public: + + RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality) + : objectID_ (objectID), builder_ (builder), quality_ (quality) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) + { + BVH* object = topBuilder->getBVH(objectID_); assert(object); + + /* build object if it got modified */ + if (topBuilder->isGeometryModified(objectID_)) + builder_->build(); + + /* create build primitive */ + if (!object->getBounds().empty()) + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + Mesh* mesh = topBuilder->getMesh(objectID_); + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size()); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root); +#endif + } + } + + bool meshQualityChanged (RTCBuildQuality currQuality) { + return currQuality != quality_; + } + + private: + size_t objectID_; + Ref<Builder> builder_; + RTCBuildQuality quality_; + }; + + void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh); + void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh); + + BVH* getBVH (size_t objectID) { + return this->bvh->objects[objectID]; + } + Mesh* getMesh (size_t objectID) { + return this->scene->template getSafe<Mesh>(objectID); + } + bool isGeometryModified (size_t objectID) { + return this->scene->isGeometryModified(objectID); + } + + void resizeRefsList () + { + size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0), + [this](const range<size_t>& r)->size_t { + size_t c = 0; + for (auto i=r.begin(); i<r.end(); ++i) { + Mesh* mesh = scene->getSafe<Mesh>(i); + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + size_t meshSize = mesh->size(); + c += isSmallGeometry(mesh) ? Primitive::blocks(meshSize) : 1; + } + return c; + }, + std::plus<size_t>() + ); + + if (refs.size() < num) { + refs.resize(num); + } + } + + void createMeshAccel (size_t geomID, Builder*& builder) + { + bvh->objects[geomID] = new BVH(Primitive::type,scene); + BVH* accel = bvh->objects[geomID]; + auto mesh = scene->getSafe<Mesh>(geomID); + if (nullptr == mesh) { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type"); + return; + } + + __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder); + } + + using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>; + + BuilderList builders; + BVH* bvh; + Scene* scene; + mvector<BuildRef> refs; + mvector<PrimRef> prims; + std::atomic<int> nextRef; + const size_t singleThreadThreshold; + Geometry::GTypeMask gtype; + bool useMortonBuilder_ = false; + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h new file mode 100644 index 0000000000..023b52b780 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_builder_twolevel_internal.h @@ -0,0 +1,267 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + + namespace isa + { + + namespace __internal_two_level_builder__ { + + template<int N, typename Mesh, typename Primitive> + struct MortonBuilder {}; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct SAHBuilder {}; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct RefitBuilder {}; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct MeshBuilder { + MeshBuilder () {} + void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) { + if(useMortonBuilder) { + builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); + return; + } + switch (mesh->quality) { + case RTC_BUILD_QUALITY_LOW: builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_MEDIUM: + case RTC_BUILD_QUALITY_HIGH: builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_REFIT: builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality"); + } + } + }; + } + } +}
\ No newline at end of file diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.cpp b/thirdparty/embree/kernels/bvh/bvh_collider.cpp new file mode 100644 index 0000000000..9428c0b88e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_collider.cpp @@ -0,0 +1,375 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_collider.h" +#include "../geometry/triangle_triangle_intersector.h" + +namespace embree +{ + namespace isa + { +#define CSTAT(x) + + size_t parallel_depth_threshold = 3; + CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0)); + CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0)); + CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0)); + + struct Collision + { + __forceinline Collision() {} + + __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1) + : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {} + + unsigned geomID0; + unsigned primID0; + unsigned geomID1; + unsigned primID1; + }; + + template<int N> + __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),node1.lower_x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),node1.lower_y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),node1.lower_z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),node1.upper_x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),node1.upper_y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),node1.upper_z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template<int N> + __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),box1.lower.x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),box1.lower.y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),box1.lower.z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),box1.upper.x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),box1.upper.y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template<int N> + __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x[i]),box1.upper.x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y[i]),box1.upper.y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z[i]),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1) + { + CSTAT(bvh_collide_prim_intersections1++); + const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0); + const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1); + const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0); + const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1); + + /* special culling for scene intersection with itself */ + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore self intersections */ + if (primID0 == primID1) + return false; + } + CSTAT(bvh_collide_prim_intersections2++); + + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore intersection with topological neighbors */ + const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]); + if (any(vint4(tri1.v[0]) == t0)) return false; + if (any(vint4(tri1.v[1]) == t0)) return false; + if (any(vint4(tri1.v[2]) == t0)) return false; + } + CSTAT(bvh_collide_prim_intersections3++); + + const Vec3fa a0 = mesh0->vertex(tri0.v[0]); + const Vec3fa a1 = mesh0->vertex(tri0.v[1]); + const Vec3fa a2 = mesh0->vertex(tri0.v[2]); + const Vec3fa b0 = mesh1->vertex(tri1.v[0]); + const Vec3fa b1 = mesh1->vertex(tri1.v[1]); + const Vec3fa b2 = mesh1->vertex(tri1.v[2]); + + return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2); + } + + template<int N> + __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1) + { + Collision collisions[16]; + size_t num_collisions = 0; + + size_t N0; Object* leaf0 = (Object*) node0.leaf(N0); + size_t N1; Object* leaf1 = (Object*) node1.leaf(N1); + for (size_t i=0; i<N0; i++) { + for (size_t j=0; j<N1; j++) { + const unsigned geomID0 = leaf0[i].geomID(); + const unsigned primID0 = leaf0[i].primID(); + const unsigned geomID1 = leaf1[j].geomID(); + const unsigned primID1 = leaf1[j].primID(); + if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue; + collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1); + if (num_collisions == 16) { + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + num_collisions = 0; + } + } + } + if (num_collisions) + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + } + + template<int N> + void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1) + { + CSTAT(bvh_collide_traversal_steps++); + if (unlikely(ref0.isLeaf())) { + if (unlikely(ref1.isLeaf())) { + CSTAT(bvh_collide_leaf_pairs++); + processLeaf(ref0,ref1); + return; + } else goto recurse_node1; + + } else { + if (unlikely(ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(bounds0) > area(bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + AABBNode* node0 = ref0.getAABBNode(); + size_t mask = overlap<N>(bounds1,*node0); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i<N; i++) { +#if 0 + if (depth0 < parallel_depth_threshold) + { + parallel_for(size_t(N), [&] ( size_t i ) { + if (mask & ( 1 << i)) { + BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + } + return; + } + + { + recurse_node1: + AABBNode* node1 = ref1.getAABBNode(); + size_t mask = overlap<N>(bounds0,*node1); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i<N; i++) { +#if 0 + if (depth1 < parallel_depth_threshold) + { + parallel_for(size_t(N), [&] ( size_t i ) { + if (mask & ( 1 << i)) { + BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + } + return; + } + } + + template<int N> + void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs) + { + if (unlikely(job.ref0.isLeaf())) { + if (unlikely(job.ref1.isLeaf())) { + jobs.push_back(job); + return; + } else goto recurse_node1; + } else { + if (unlikely(job.ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(job.bounds0) > area(job.bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + const AABBNode* node0 = job.ref0.getAABBNode(); + size_t mask = overlap<N>(job.bounds1,*node0); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1)); + } + return; + } + + { + recurse_node1: + const AABBNode* node1 = job.ref1.getAABBNode(); + size_t mask = overlap<N>(job.bounds0,*node1); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1)); + } + return; + } + } + + template<int N> + void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1) + { + CSTAT(bvh_collide_traversal_steps = 0); + CSTAT(bvh_collide_leaf_pairs = 0); + CSTAT(bvh_collide_leaf_iterations = 0); + CSTAT(bvh_collide_prim_intersections1 = 0); + CSTAT(bvh_collide_prim_intersections2 = 0); + CSTAT(bvh_collide_prim_intersections3 = 0); + CSTAT(bvh_collide_prim_intersections4 = 0); + CSTAT(bvh_collide_prim_intersections5 = 0); + CSTAT(bvh_collide_prim_intersections = 0); +#if 0 + collide_recurse(ref0,bounds0,ref1,bounds1,0,0); +#else + const int M = 2048; + jobvector jobs[2]; + jobs[0].reserve(M); + jobs[1].reserve(M); + jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0)); + int source = 0; + int target = 1; + + /* try to split job until job list is full */ + while (jobs[source].size()+8 <= M) + { + for (size_t i=0; i<jobs[source].size(); i++) + { + const CollideJob& job = jobs[source][i]; + size_t remaining = jobs[source].size()-i; + if (jobs[target].size()+remaining+8 > M) { + jobs[target].push_back(job); + } else { + split(job,jobs[target]); + } + } + + /* stop splitting jobs if we reached only leaves and cannot make progress anymore */ + if (jobs[target].size() == jobs[source].size()) + break; + + jobs[source].resize(0); + std::swap(source,target); + } + + /* parallel processing of all jobs */ + parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) { + CollideJob& j = jobs[source][i]; + collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1); + }); + + +#endif + CSTAT(PRINT(bvh_collide_traversal_steps)); + CSTAT(PRINT(bvh_collide_leaf_pairs)); + CSTAT(PRINT(bvh_collide_leaf_iterations)); + CSTAT(PRINT(bvh_collide_prim_intersections1)); + CSTAT(PRINT(bvh_collide_prim_intersections2)); + CSTAT(PRINT(bvh_collide_prim_intersections3)); + CSTAT(PRINT(bvh_collide_prim_intersections4)); + CSTAT(PRINT(bvh_collide_prim_intersections5)); + CSTAT(PRINT(bvh_collide_prim_intersections)); + } + + template<int N> + void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr) + { + BVHNColliderUserGeom<N>(bvh0->scene,bvh1->scene,callback,userPtr). + collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds()); + } + +#if defined (EMBREE_LOWEST_ISA) + struct collision_regression_test : public RegressionTest + { + collision_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f), + Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; + return passed; + } + }; + + collision_regression_test collision_regression("collision_regression_test"); +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Collider Definitions + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); + +#if defined(__AVX__) + DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); +#endif + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_collider.h b/thirdparty/embree/kernels/bvh/bvh_collider.h new file mode 100644 index 0000000000..3c42f211c1 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_collider.h @@ -0,0 +1,72 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/trianglev.h" +#include "../geometry/object.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNCollider + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + struct CollideJob + { + CollideJob () {} + + CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0, + NodeRef ref1, const BBox3fa& bounds1, size_t depth1) + : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {} + + NodeRef ref0; + BBox3fa bounds0; + size_t depth0; + NodeRef ref1; + BBox3fa bounds1; + size_t depth1; + }; + + typedef vector_t<CollideJob, aligned_allocator<CollideJob,16>> jobvector; + + void split(const CollideJob& job, jobvector& jobs); + + public: + __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {} + + public: + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; + void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1); + void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1); + + protected: + Scene* scene0; + Scene* scene1; + RTCCollideFunc callback; + void* userPtr; + }; + + template<int N> + class BVHNColliderUserGeom : public BVHNCollider<N> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : BVHNCollider<N>(scene0,scene1,callback,userPtr) {} + + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1); + public: + static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_factory.h b/thirdparty/embree/kernels/bvh/bvh_factory.h new file mode 100644 index 0000000000..453d455bd9 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_factory.h @@ -0,0 +1,21 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../common/isa.h" +#include "../common/accel.h" +#include "../common/scene.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + /*! BVH instantiations */ + class BVHFactory + { + public: + enum class BuildVariant { STATIC, DYNAMIC, HIGH_QUALITY }; + enum class IntersectVariant { FAST, ROBUST }; + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp new file mode 100644 index 0000000000..9594f402c3 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.cpp @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.h" +#include "node_intersector1.h" +#include "bvh_traverser1.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + template<int N, int types, bool robust, typename PrimitiveIntersector1> + void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This, + RayHit& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + StackItemT<NodeRef> stack[stackSize]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + if (bvh->root == BVH::emptyNode) + return; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay<N,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, types> nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > ray.tfar)) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(normal.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node); + tray.tfar = ray.tfar; + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This, + Ray& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* early out for already occluded rays */ + if (unlikely(ray.tfar < 0.0f)) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + NodeRef stack[stackSize]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSize; + stack[0] = bvh->root; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay<N,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, types> nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(shadow.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) { + ray.tfar = neg_inf; + break; + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + *stackPtr = (NodeRef)lazy_node; + stackPtr++; + } + } + } + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + struct PointQueryDispatch + { + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return false; + + /* stack state */ + StackItemT<NodeRef> stack[stackSize]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + /* verify correct input */ + assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f)); + + /* load the point query into SIMD registers */ + TravPointQuery<N> tquery(query->p, context->query_radius); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N,types> nodeTraverser; + + bool changed = false; + float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > cull_radius)) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(point_query.trav_nodes,1,1,1); + bool nodeIntersected; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask); + } else { + nodeIntersected = BVHNNodePointQueryAABB1 <N, types>::pointQuery(cur, tquery, query->time, tNear, mask); + } + if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(point_query.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node)) + { + changed = true; + tquery.rad = context->query_radius; + cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + return changed; + } + }; + + /* disable point queries for not yet supported geometry types */ + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery( + const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + return PointQueryDispatch<N, types, robust, PrimitiveIntersector1>::pointQuery(This, query, context); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1.h b/thirdparty/embree/kernels/bvh/bvh_intersector1.h new file mode 100644 index 0000000000..2df3d6eddb --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector1.h @@ -0,0 +1,34 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + /*! BVH single ray intersector. */ + template<int N, int types, bool robust, typename PrimitiveIntersector1> + class BVHNIntersector1 + { + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + public: + static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context); + static void occluded (const Accel::Intersectors* This, Ray& ray, IntersectContext* context); + static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp new file mode 100644 index 0000000000..831d613367 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector1_bvh4.cpp @@ -0,0 +1,61 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.cpp" + +namespace embree +{ + namespace isa + { + int getISA() { + return VerifyMultiTargetLinking::getISA(); + } + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector1 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >)); + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h new file mode 100644 index 0000000000..50ebf375c4 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.h @@ -0,0 +1,58 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" +#include "node_intersector_frustum.h" + +namespace embree +{ + namespace isa + { + template<int K, bool robust> + struct TravRayK; + + /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */ + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true> + class BVHNIntersectorKHybrid + { + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersectorK::Precalculations Precalculations; + typedef typename PrimitiveIntersectorK::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth; + + static const size_t switchThresholdIncoherent = \ + (K==4) ? 3 : + (K==8) ? ((N==4) ? 5 : 7) : + (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal + 0; + + private: + static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); + static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); + + public: + static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); + static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); + + static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); + static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); + + }; + + /*! BVH packet intersector. */ + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK> + class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {}; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h new file mode 100644 index 0000000000..717f559677 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h @@ -0,0 +1,270 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector_packet_stream.h" +#include "node_intersector_frustum.h" +#include "bvh_traverser_stream.h" + +namespace embree +{ + namespace isa + { + /*! BVH ray stream intersector. */ + template<int N, int types, bool robust, typename PrimitiveIntersector> + class BVHNIntersectorStream + { + /* shortcuts for frequently used types */ + template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>; + template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + + template<int K> + __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays, + TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant) + { + const size_t numPackets = (numOctantRays+K-1)/K; + + Vec3vf<K> tmp_min_rdir(pos_inf); + Vec3vf<K> tmp_max_rdir(neg_inf); + Vec3vf<K> tmp_min_org(pos_inf); + Vec3vf<K> tmp_max_org(neg_inf); + vfloat<K> tmp_min_dist(pos_inf); + vfloat<K> tmp_max_dist(neg_inf); + + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat<K> tnear = inputPackets[i]->tnear(); + const vfloat<K> tfar = inputPackets[i]->tfar; + vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + m_valid &= inputPackets[i]->valid(); +#endif + + m_active |= (size_t)movemask(m_valid) << (i*K); + + vfloat<K> packet_min_dist = max(tnear, 0.0f); + vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); + tmp_min_dist = min(tmp_min_dist, packet_min_dist); + tmp_max_dist = max(tmp_max_dist, packet_max_dist); + + const Vec3vf<K>& org = inputPackets[i]->org; + const Vec3vf<K>& dir = inputPackets[i]->dir; + + new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist); + + tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf))); + tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf))); + tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf))); + tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf))); + } + + m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1); + + + const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), + reduce_min(tmp_min_rdir.y), + reduce_min(tmp_min_rdir.z)); + + const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), + reduce_max(tmp_max_rdir.y), + reduce_max(tmp_max_rdir.z)); + + const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), + reduce_min(tmp_min_org.y), + reduce_min(tmp_min_org.z)); + + const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), + reduce_max(tmp_max_org.y), + reduce_max(tmp_max_org.z)); + + commonOctant = + (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && + (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && + (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); + + const float frustum_min_dist = reduce_min(tmp_min_dist); + const float frustum_max_dist = reduce_max(tmp_max_dist); + + frustum.init(reduced_min_origin, reduced_max_origin, + reduced_min_rdir, reduced_max_rdir, + frustum_min_dist, frustum_max_dist, + N); + + return m_active; + } + + template<int K> + __forceinline static size_t intersectAABBNodePacket(size_t m_active, + const TravRayKStream<K,robust>* packets, + const AABBNode* __restrict__ node, + size_t boxID, + const NearFarPrecalculations& nf) + { + assert(m_active); + const size_t startPacketID = bsf(m_active) / K; + const size_t endPacketID = bsr(m_active) / K; + size_t m_trav_active = 0; + for (size_t i = startPacketID; i <= endPacketID; i++) + { + const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf); + m_trav_active |= m_hit << (i*K); + } + return m_trav_active; + } + + template<int K> + __forceinline static size_t traverseCoherentStream(size_t m_active, + TravRayKStream<K, robust>* packets, + const AABBNode* __restrict__ node, + const Frustum<robust>& frustum, + size_t* maskK, + vfloat<N>& dist) + { + size_t m_node_hit = intersectNodeFrustum<N>(node, frustum, dist); + const size_t first_index = bsf(m_active); + const size_t first_packetID = first_index / K; + const size_t first_rayID = first_index % K; + size_t m_first_hit = intersectNode1<N>(node, packets[first_packetID], first_rayID, frustum.nf); + + /* this make traversal independent of the ordering of rays */ + size_t m_node = m_node_hit ^ m_first_hit; + while (unlikely(m_node)) + { + const size_t boxID = bscf(m_node); + const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); + m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); + maskK[boxID] = m_current; + } + return m_node_hit; + } + + // TODO: explicit 16-wide path for KNL + template<int K> + __forceinline static vint<N> traverseIncoherentStream(size_t m_active, + TravRayKStreamFast<K>* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint<N> vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream<K,robust> &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint<N> bitmask(shiftTable[rayID]); + const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); + const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); + const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); + const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); + + const vbool<N> hit_mask = tNear <= tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint<N>(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif + } while(m_active); + return vmask; + } + + template<int K> + __forceinline static vint<N> traverseIncoherentStream(size_t m_active, + TravRayKStreamRobust<K>* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint<N> vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream<K,robust> &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint<N> bitmask(shiftTable[rayID]); + const vfloat<N> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i]; + const vfloat<N> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i]; + const vfloat<N> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat<N> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i]; + const vfloat<N> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i]; + const vfloat<N> tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); + const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); + const float round_down = 1.0f-2.0f*float(ulp); + const float round_up = 1.0f+2.0f*float(ulp); + const vbool<N> hit_mask = round_down*tNear <= round_up*tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint<N>(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif + } while(m_active); + return vmask; + } + + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth; + + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template<int K> + static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + }; + + + /*! BVH ray stream intersector with direct fallback to packets. */ + template<int N> + class BVHNIntersectorStreamPacketFallback + { + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template<int K> + static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h new file mode 100644 index 0000000000..e7df7c2ae2 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/ray.h" +#include "../common/scene.h" + +namespace embree +{ + namespace isa + { + class RayStreamFilter + { + public: + static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context); + static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context); + static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context); + + static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context); + static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context); + static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context); + + private: + template<int K, bool intersect> + static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context); + + template<int K, bool intersect> + static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context); + + template<int K, bool intersect> + static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + + template<int K, bool intersect> + static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context); + }; + } +}; diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h new file mode 100644 index 0000000000..57530692bc --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h @@ -0,0 +1,213 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN AABBNode */ + template<typename NodeRef, int N> + struct AABBNode_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const { + node.getAABBNode()->setRef(i,child); + node.getAABBNode()->setBounds(i,bounds); + } + }; + + struct Create2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear(); + for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds()); + return NodeRef::encodeNode(node); + } + }; + + struct Set2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + return ref; + } + }; + + struct Set3 + { + Set3 (FastAllocator* allocator, PrimRef* prims) + : allocator(allocator), prims(prims) {} + + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + + if (unlikely(precord.alloc_barrier)) + { + PrimRef* begin = &prims[precord.prims.begin()]; + PrimRef* end = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!! + size_t bytes = (size_t)end - (size_t)begin; + allocator->addBlock(begin,bytes); + } + + return ref; + } + + FastAllocator* const allocator; + PrimRef* const prims; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = pos_inf; + upper_x = upper_y = upper_z = neg_inf; + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds) + { + assert(i < N); + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) { + setBounds(i,bounds); + children[i] = ref; + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z)); + const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z)); + return BBox3fa(lower,upper); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]); + const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extend(size_t i) const { + return bounds(i).size(); + } + + /*! Returns bounds of all children (implemented later as specializations) */ + __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const; + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i<N && j<N); + std::swap(children[i],children[j]); + std::swap(lower_x[i],lower_x[j]); + std::swap(lower_y[i],lower_y[j]); + std::swap(lower_z[i],lower_z[j]); + std::swap(upper_x[i],upper_x[j]); + std::swap(upper_y[i],upper_y[j]); + std::swap(upper_z[i],upper_z[j]); + } + + /*! swap the children of two nodes */ + __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j) + { + assert(i<N && j<N); + std::swap(a->children[i],b->children[j]); + std::swap(a->lower_x[i],b->lower_x[j]); + std::swap(a->lower_y[i],b->lower_y[j]); + std::swap(a->lower_z[i],b->lower_z[j]); + std::swap(a->upper_x[i],b->upper_x[j]); + std::swap(a->upper_y[i],b->upper_y[j]); + std::swap(a->upper_z[i],b->upper_z[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNode_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; i<j; i++) { + if (a->child(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n) + { + o << "AABBNode { " << embree_endl; + o << " lower_x " << n.lower_x << embree_endl; + o << " upper_x " << n.upper_x << embree_endl; + o << " lower_y " << n.lower_y << embree_endl; + o << " upper_y " << n.upper_y << embree_endl; + o << " lower_z " << n.lower_z << embree_endl; + o << " upper_z " << n.upper_z << embree_endl; + o << " children = "; + for (size_t i=0; i<N; i++) o << n.children[i] << " "; + o << embree_endl; + o << "}" << embree_endl; + return o; + } + + public: + vfloat<N> lower_x; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_x; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children. + }; + + template<> + __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const { + transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower); + transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper); + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h new file mode 100644 index 0000000000..c4cea7d8ba --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h @@ -0,0 +1,247 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Motion Blur AABBNode */ + template<typename NodeRef, int N> + struct AABBNodeMB_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + struct Create + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + template<typename BuildRecord> + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; i<num; i++) { + node->setRef(i,children[i].ref); + node->setBounds(i,children[i].lbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + }; + + struct SetTimeRange + { + __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {} + + template<typename BuildRecord> + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; i<num; i++) { + node->setRef(i, children[i].ref); + node->setBounds(i, children[i].lbounds, tbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + + BBox1f tbounds; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = vfloat<N>(pos_inf); + upper_x = upper_y = upper_z = vfloat<N>(neg_inf); + lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f); + upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, NodeRef ref) { + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i) + { + /*! for empty bounds we have to avoid inf-inf=nan */ + BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX))); + BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX))); + bounds0 = bounds0.enlarge_by(4.0f*float(ulp)); + bounds1 = bounds1.enlarge_by(4.0f*float(ulp)); + Vec3fa dlower = bounds1.lower-bounds0.lower; + Vec3fa dupper = bounds1.upper-bounds0.upper; + + lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z; + upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z; + + lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z; + upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds) { + setBounds(i, bounds.bounds0, bounds.bounds1); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) { + setBounds(i, bounds.global(tbounds)); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) { + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + children[i] = ref; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) + { + setRef(i, child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Return bounding box for time 0 */ + __forceinline BBox3fa bounds0(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]), + Vec3fa(upper_x[i],upper_y[i],upper_z[i])); + } + + /*! Return bounding box for time 1 */ + __forceinline BBox3fa bounds1(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]), + Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i])); + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)), + reduce_min(min(lower_y,lower_y+lower_dy)), + reduce_min(min(lower_z,lower_z+lower_dz))), + Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)), + reduce_max(max(upper_y,upper_y+upper_dy)), + reduce_max(max(upper_z,upper_z+upper_dz)))); + } + + /*! Return bounding box of child i */ + __forceinline BBox3fa bounds(size_t i) const { + return merge(bounds0(i),bounds1(i)); + } + + /*! Return linear bounding box of child i */ + __forceinline LBBox3fa lbounds(size_t i) const { + return LBBox3fa(bounds0(i),bounds1(i)); + } + + /*! Return bounding box of child i at specified time */ + __forceinline BBox3fa bounds(size_t i, float time) const { + return lerp(bounds0(i),bounds1(i),time); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return lbounds(i).expectedHalfArea(); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const { + return lbounds(i).expectedHalfArea(t0t1); + } + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i<N && j<N); + std::swap(children[i],children[j]); + + std::swap(lower_x[i],lower_x[j]); + std::swap(upper_x[i],upper_x[j]); + std::swap(lower_y[i],lower_y[j]); + std::swap(upper_y[i],upper_y[j]); + std::swap(lower_z[i],lower_z[j]); + std::swap(upper_z[i],upper_z[j]); + + std::swap(lower_dx[i],lower_dx[j]); + std::swap(upper_dx[i],upper_dx[j]); + std::swap(lower_dy[i],lower_dy[j]); + std::swap(upper_dy[i],upper_dy[j]); + std::swap(lower_dz[i],lower_dz[j]); + std::swap(upper_dz[i],upper_dz[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNodeMB_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; i<j; i++) { + if (a->child(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n) + { + cout << "AABBNodeMB {" << embree_endl; + for (size_t i=0; i<N; i++) + { + const BBox3fa b0 = n.bounds0(i); + const BBox3fa b1 = n.bounds1(i); + cout << " child" << i << " { " << embree_endl; + cout << " bounds0 = " << b0 << ", " << embree_endl; + cout << " bounds1 = " << b1 << ", " << embree_endl; + cout << " }"; + } + cout << "}"; + return cout; + } + + public: + vfloat<N> lower_x; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_x; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children. + + vfloat<N> lower_dx; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_dx; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_dy; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_dy; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_dz; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_dz; //!< Z dimension of upper bounds of all N children. + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h new file mode 100644 index 0000000000..46a81d7581 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h @@ -0,0 +1,107 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_aabb_mb.h" + +namespace embree +{ + /*! Aligned 4D Motion Blur Node */ + template<typename NodeRef, int N> + struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + using AABBNodeMB_t<NodeRef,N>::set; + + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + struct Create + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const + { + if (hasTimeSplits) + { + AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + else + { + AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + } + }; + + struct Set + { + template<typename BuildRecord> + __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const + { + if (likely(ref.isAABBNodeMB())) { + for (size_t i=0; i<num; i++) + ref.getAABBNodeMB()->set(i, children[i]); + } else { + for (size_t i=0; i<num; i++) + ref.getAABBNodeMB4D()->set(i, children[i]); + } + } + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_t = vfloat<N>(pos_inf); + upper_t = vfloat<N>(neg_inf); + AABBNodeMB_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) + { + AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds)); + lower_t[i] = tbounds.lower; + upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) { + AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i)); + } + + /*! returns time range for specified child */ + __forceinline BBox1f timeRange(size_t i) const { + return BBox1f(lower_t[i],upper_t[i]); + } + + /*! stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) + { + cout << "AABBNodeMB4D {" << embree_endl; + for (size_t i=0; i<N; i++) + { + const BBox3fa b0 = n.bounds0(i); + const BBox3fa b1 = n.bounds1(i); + cout << " child" << i << " { " << embree_endl; + cout << " bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl; + cout << " bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl; + cout << " time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl; + cout << " }"; + } + cout << "}"; + return cout; + } + + public: + vfloat<N> lower_t; //!< time dimension of lower bounds of all N children + vfloat<N> upper_t; //!< time dimension of upper bounds of all N children + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_base.h b/thirdparty/embree/kernels/bvh/bvh_node_base.h new file mode 100644 index 0000000000..a5570a7b9e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_base.h @@ -0,0 +1,43 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_ref.h" + +namespace embree +{ + + /*! BVHN Base Node */ + template<typename NodeRef, int N> + struct BaseNode_t + { + /*! Clears the node. */ + __forceinline void clear() + { + for (size_t i=0; i<N; i++) + children[i] = NodeRef::emptyNode; + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! verifies the node */ + __forceinline bool verify() const + { + for (size_t i=0; i<N; i++) { + if (child(i) == NodeRef::emptyNode) { + for (; i<N; i++) { + if (child(i) != NodeRef::emptyNode) + return false; + } + break; + } + } + return true; + } + + NodeRef children[N]; //!< Pointer to the N children (can be a node or leaf) + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_obb.h b/thirdparty/embree/kernels/bvh/bvh_node_obb.h new file mode 100644 index 0000000000..e6b500691e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_obb.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Node with unaligned bounds */ + template<typename NodeRef, int N> + struct OBBNode_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const { + node.ungetAABBNode()->setRef(i,child); + node.ungetAABBNode()->setBounds(i,bounds); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + naabb.l.vx = Vec3fa(nan); + naabb.l.vy = Vec3fa(nan); + naabb.l.vz = Vec3fa(nan); + naabb.p = Vec3fa(nan); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box. */ + __forceinline void setBounds(size_t i, const OBBox3fa& b) + { + assert(i < N); + + AffineSpace3fa space = b.space; + space.p -= b.bounds.lower; + space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; + + naabb.l.vx.x[i] = space.l.vx.x; + naabb.l.vx.y[i] = space.l.vx.y; + naabb.l.vx.z[i] = space.l.vx.z; + + naabb.l.vy.x[i] = space.l.vy.x; + naabb.l.vy.y[i] = space.l.vy.y; + naabb.l.vy.z[i] = space.l.vy.z; + + naabb.l.vz.x[i] = space.l.vz.x; + naabb.l.vz.y[i] = space.l.vz.y; + naabb.l.vz.z[i] = space.l.vz.z; + + naabb.p.x[i] = space.p.x; + naabb.p.y[i] = space.p.y; + naabb.p.z[i] = space.p.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent(size_t i) const { + assert(i<N); + const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]); + const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]); + const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n) + { + o << "UnAABBNode { " << n.naabb << " } " << embree_endl; + return o; + } + + public: + AffineSpace3vf<N> naabb; //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space) + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h new file mode 100644 index 0000000000..c06b1aea5e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_obb_mb.h @@ -0,0 +1,90 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + template<typename NodeRef, int N> + struct OBBNodeMB_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const { + node.ungetAABBNodeMB()->setRef(i,child); + node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt)); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + space0 = one; + //b0.lower = b0.upper = Vec3fa(nan); + b1.lower = b1.upper = Vec3fa(nan); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets space and bounding boxes. */ + __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) { + setBounds(i,space,lbounds.bounds0,lbounds.bounds1); + } + + /*! Sets space and bounding boxes. */ + __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c) + { + assert(i < N); + + AffineSpace3fa space = s0; + space.p -= a.lower; + Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); + space = AffineSpace3fa::scale(scale)*space; + BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); + BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale); + + space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z; + space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z; + space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z; + space0.p .x[i] = space.p .x; space0.p .y[i] = space.p .y; space0.p .z[i] = space.p .z; + + /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z; + b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/ + + b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z; + b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent0(size_t i) const { + assert(i < N); + const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]); + const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]); + const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + public: + AffineSpace3vf<N> space0; + //BBox3vf<N> b0; // these are the unit bounds + BBox3vf<N> b1; + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h new file mode 100644 index 0000000000..2afc8c98e7 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h @@ -0,0 +1,265 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN Quantized Node */ + template<int N> + struct __aligned(8) QuantizedBaseNode_t + { + typedef unsigned char T; + static const T MIN_QUAN = 0; + static const T MAX_QUAN = 255; + + /*! Clears the node. */ + __forceinline void clear() { + for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN; + for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN; + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x), + madd(scale.y,(float)lower_y[i],start.y), + madd(scale.z,(float)lower_z[i],start.z)); + const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x), + madd(scale.y,(float)upper_y[i],start.y), + madd(scale.z,(float)upper_z[i],start.z)); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + static __forceinline void init_dim(const vfloat<N> &lower, + const vfloat<N> &upper, + T lower_quant[N], + T upper_quant[N], + float &start, + float &scale) + { + /* quantize bounds */ + const vbool<N> m_valid = lower != vfloat<N>(pos_inf); + const float minF = reduce_min(lower); + const float maxF = reduce_max(upper); + float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); + float decode_scale = diff / float(MAX_QUAN); + if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero + assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); + const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; + vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); + vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); + + /* lower/upper correction */ + vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower; + vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper; + ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); + iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); + + /* disable invalid lanes */ + ilower = select(m_valid,ilower,MAX_QUAN); + iupper = select(m_valid,iupper,MIN_QUAN); + + /* store as uchar to memory */ + vint<N>::store(lower_quant,ilower); + vint<N>::store(upper_quant,iupper); + start = minF; + scale = decode_scale; + +#if defined(DEBUG) + vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); + vfloat<N> extract_upper( vint<N>::loadu(upper_quant) ); + vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF); + vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF); + assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); + assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); +#endif + } + + __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) + { + init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); + init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); + init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); + } + + __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } + +#if defined(__AVX512F__) // KNL + __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } +#endif + __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } + + __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } + + __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } + + __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } + + __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } + + __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } + + template <int M> + __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } + +#if defined(__AVX512F__) + __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } + __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } + __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } +#endif + + union { + struct { + T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children + T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children + T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children + T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children + T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children + T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children + }; + T all_planes[6*N]; + }; + + Vec3f start; + Vec3f scale; + + friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) + { + o << "QuantizedBaseNode { " << embree_endl; + o << " start " << n.start << embree_endl; + o << " scale " << n.scale << embree_endl; + o << " lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl; + o << " upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl; + o << " lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl; + o << " upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl; + o << " lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl; + o << " upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl; + o << "}" << embree_endl; + return o; + } + + }; + + template<typename NodeRef, int N> + struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> + { + using BaseNode_t<NodeRef,N>::children; + using QuantizedBaseNode_t<N>::lower_x; + using QuantizedBaseNode_t<N>::upper_x; + using QuantizedBaseNode_t<N>::lower_y; + using QuantizedBaseNode_t<N>::upper_y; + using QuantizedBaseNode_t<N>::lower_z; + using QuantizedBaseNode_t<N>::upper_z; + using QuantizedBaseNode_t<N>::start; + using QuantizedBaseNode_t<N>::scale; + using QuantizedBaseNode_t<N>::init_dim; + + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + struct Create2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const + { + __aligned(64) AABBNode_t<NodeRef,N> node; + node.clear(); + for (size_t i=0; i<n; i++) { + node.setBounds(i,children[i].bounds()); + } + QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment); + qnode->init(node); + + return (size_t)qnode | NodeRef::tyQuantizedNode; + } + }; + + struct Set2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + QuantizedNode_t* node = ref.quantizedNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + return ref; + } + }; + + __forceinline void init(AABBNode_t<NodeRef,N>& node) + { + for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode; + init_dim(node); + } + + }; + + /*! BVHN Quantized Node */ + template<int N> + struct __aligned(8) QuantizedBaseNodeMB_t + { + QuantizedBaseNode_t<N> node0; + QuantizedBaseNode_t<N> node1; + + /*! Clears the node. */ + __forceinline void clear() { + node0.clear(); + node1.clear(); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + BBox3fa bounds0 = node0.bounds(i); + BBox3fa bounds1 = node1.bounds(i); + bounds0.extend(bounds1); + return bounds0; + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + __forceinline vbool<N> validMask() const { return node0.validMask(); } + + template<typename T> + __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } + + + template<int M> + __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); } + + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_node_ref.h b/thirdparty/embree/kernels/bvh/bvh_node_ref.h new file mode 100644 index 0000000000..6f6da758de --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_node_ref.h @@ -0,0 +1,242 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/alloc.h" +#include "../common/accel.h" +#include "../common/device.h" +#include "../common/scene.h" +#include "../geometry/primitive.h" +#include "../common/ray.h" + +namespace embree +{ + /* BVH node reference with bounds */ + template<typename NodeRef> + struct BVHNodeRecord + { + __forceinline BVHNodeRecord() {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {} + + NodeRef ref; + BBox3fx bounds; + }; + + template<typename NodeRef> + struct BVHNodeRecordMB + { + __forceinline BVHNodeRecordMB() {} + __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {} + + NodeRef ref; + LBBox3fa lbounds; + }; + + template<typename NodeRef> + struct BVHNodeRecordMB4D + { + __forceinline BVHNodeRecordMB4D() {} + __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {} + + NodeRef ref; + LBBox3fa lbounds; + BBox1f dt; + }; + + template<typename NodeRef, int N> struct BaseNode_t; + template<typename NodeRef, int N> struct AABBNode_t; + template<typename NodeRef, int N> struct AABBNodeMB_t; + template<typename NodeRef, int N> struct AABBNodeMB4D_t; + template<typename NodeRef, int N> struct OBBNode_t; + template<typename NodeRef, int N> struct OBBNodeMB_t; + template<typename NodeRef, int N> struct QuantizedNode_t; + template<typename NodeRef, int N> struct QuantizedNodeMB_t; + + /*! Pointer that points to a node or a list of primitives */ + template<int N> + struct NodeRefPtr + { + //template<int NN> friend class BVHN; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! highest address bit is used as barrier for some algorithms */ + static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1)); + + /*! Masks the bits that store the number of items per leaf. */ + static const size_t align_mask = byteAlignment-1; + static const size_t items_mask = byteAlignment-1; + + /*! different supported node types */ + static const size_t tyAABBNode = 0; + static const size_t tyAABBNodeMB = 1; + static const size_t tyAABBNodeMB4D = 6; + static const size_t tyOBBNode = 2; + static const size_t tyOBBNodeMB = 3; + static const size_t tyQuantizedNode = 5; + static const size_t tyLeaf = 8; + + /*! Empty node */ + static const size_t emptyNode = tyLeaf; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0); + static const size_t popRay = (((size_t)-1) & (~items_mask)) | (tyLeaf+1); + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = items_mask-tyLeaf; + + /*! Default constructor */ + __forceinline NodeRefPtr () {} + + /*! Construction from integer */ + __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {} + + /*! Cast to size_t */ + __forceinline operator size_t() const { return ptr; } + + /*! Sets the barrier bit. */ + __forceinline void setBarrier() { +#if defined(__64BIT__) + assert(!isBarrier()); + ptr |= barrier_mask; +#else + assert(false); +#endif + } + + /*! Clears the barrier bit. */ + __forceinline void clearBarrier() { +#if defined(__64BIT__) + ptr &= ~barrier_mask; +#else + assert(false); +#endif + } + + /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */ + __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; } + + /*! checks if this is a leaf */ + __forceinline size_t isLeaf() const { return ptr & tyLeaf; } + + /*! returns node type */ + __forceinline int type() const { return ptr & (size_t)align_mask; } + + /*! checks if this is a node */ + __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; } + + /*! checks if this is a motion blur node */ + __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; } + + /*! checks if this is a 4D motion blur node */ + __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; } + + /*! checks if this is a node with unaligned bounding boxes */ + __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; } + + /*! checks if this is a motion blur node with unaligned bounding boxes */ + __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; } + + /*! checks if this is a quantized node */ + __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; } + + /*! Encodes a node */ + static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB4D); + } + + /*! Encodes an unaligned node */ + static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) { + return NodeRefPtr((size_t) node | tyOBBNode); + } + + /*! Encodes an unaligned motion blur node */ + static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) { + return NodeRefPtr((size_t) node | tyOBBNodeMB); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) { + assert(!((size_t)tri & align_mask)); + assert(num <= maxLeafBlocks); + return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks))); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) { + assert(!((size_t)ptr & align_mask)); + return NodeRefPtr((size_t)ptr | (tyLeaf+ty)); + } + + /*! returns base node pointer */ + __forceinline BaseNode_t<NodeRefPtr,N>* baseNode() + { + assert(!isLeaf()); + return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); + } + __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const + { + assert(!isLeaf()); + return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); + } + + /*! returns node pointer */ + __forceinline AABBNode_t<NodeRefPtr,N>* getAABBNode() { assert(isAABBNode()); return ( AABBNode_t<NodeRefPtr,N>*)ptr; } + __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; } + + /*! returns motion blur node pointer */ + __forceinline AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() { assert(isAABBNodeMB() || isAABBNodeMB4D()); return ( AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns 4D motion blur node pointer */ + __forceinline AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() { assert(isAABBNodeMB4D()); return ( AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned node pointer */ + __forceinline OBBNode_t<NodeRefPtr,N>* ungetAABBNode() { assert(isOBBNode()); return ( OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned motion blur node pointer */ + __forceinline OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() { assert(isOBBNodeMB()); return ( OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns quantized node pointer */ + __forceinline QuantizedNode_t<NodeRefPtr,N>* quantizedNode() { assert(isQuantizedNode()); return ( QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); } + __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); } + + /*! returns leaf pointer */ + __forceinline char* leaf(size_t& num) const { + assert(isLeaf()); + num = (ptr & (size_t)items_mask)-tyLeaf; + return (char*)(ptr & ~(size_t)align_mask); + } + + /*! clear all bit flags */ + __forceinline void clearFlags() { + ptr &= ~(size_t)align_mask; + } + + /*! returns the wideness */ + __forceinline size_t getN() const { return N; } + + public: + size_t ptr; + }; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.cpp b/thirdparty/embree/kernels/bvh/bvh_refit.cpp new file mode 100644 index 0000000000..bf5c8538ba --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_refit.cpp @@ -0,0 +1,247 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_refit.h" +#include "bvh_statistics.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + namespace isa + { + static const size_t SINGLE_THREAD_THRESHOLD = 4*1024; + + template<int N> + __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b) + { + size_t sa = *(size_t*)&a->node()->lower_x; + size_t sb = *(size_t*)&b->node()->lower_x; + return sa < sb; + } + + template<int N> + BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds) + : bvh(bvh), leafBounds(leafBounds), numSubTrees(0) + { + } + + template<int N> + void BVHNRefitter<N>::refit() + { + if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) { + bvh->bounds = LBBox3fa(recurse_bottom(bvh->root)); + } + else + { + BBox3fa subTreeBounds[MAX_NUM_SUB_TREES]; + numSubTrees = 0; + gather_subtree_refs(bvh->root,numSubTrees,0); + if (numSubTrees) + parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + NodeRef& ref = subTrees[i]; + subTreeBounds[i] = recurse_bottom(ref); + } + }); + + numSubTrees = 0; + bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0)); + } + } + + template<int N> + void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + subTrees[subtrees++] = ref; + return; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + for (size_t i=0; i<N; i++) { + NodeRef& child = node->child(i); + if (unlikely(child == BVH::emptyNode)) continue; + gather_subtree_refs(child,subtrees,depth+1); + } + } + } + + template<int N> + BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + assert(subTrees[subtrees] == ref); + return subTreeBounds[subtrees++]; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + BBox3fa bounds[N]; + + for (size_t i=0; i<N; i++) + { + NodeRef& child = node->child(i); + + if (unlikely(child == BVH::emptyNode)) + bounds[i] = BBox3fa(empty); + else + bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1); + } + + BBox3vf<N> boundsT = transpose<N>(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge<N>(bounds); + } + else + return leafBounds.leafBounds(ref); + } + + // ========================================================= + // ========================================================= + // ========================================================= + + + template<int N> + BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref) + { + /* this is a leaf node */ + if (unlikely(ref.isLeaf())) + return leafBounds.leafBounds(ref); + + /* recurse if this is an internal node */ + AABBNode* node = ref.getAABBNode(); + + /* enable exclusive prefetch for >= AVX platforms */ +#if defined(__AVX__) + BVH::prefetchW(ref); +#endif + BBox3fa bounds[N]; + + for (size_t i=0; i<N; i++) + if (unlikely(node->child(i) == BVH::emptyNode)) + { + bounds[i] = BBox3fa(empty); + } + else + bounds[i] = recurse_bottom(node->child(i)); + + /* AOS to SOA transform */ + BBox3vf<N> boundsT = transpose<N>(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge<N>(bounds); + } + + template<int N, typename Mesh, typename Primitive> + BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode) + : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*(typename BVHNRefitter<N>::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {} + + template<int N, typename Mesh, typename Primitive> + void BVHNRefitT<N,Mesh,Primitive>::clear() + { + if (builder) + builder->clear(); + } + + template<int N, typename Mesh, typename Primitive> + void BVHNRefitT<N,Mesh,Primitive>::build() + { + if (mesh->topologyChanged(topologyVersion)) { + topologyVersion = mesh->getTopologyVersion(); + builder->build(); + } + else + refitter->refit(); + } + + template class BVHNRefitter<4>; +#if defined(__AVX__) + template class BVHNRefitter<8>; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH4Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH8Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif + +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } +#endif +#endif + + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_refit.h b/thirdparty/embree/kernels/bvh/bvh_refit.h new file mode 100644 index 0000000000..09bb3d8da5 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_refit.h @@ -0,0 +1,95 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNRefitter + { + public: + + /*! Type shortcuts */ + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + struct LeafBoundsInterface { + virtual const BBox3fa leafBounds(NodeRef& ref) const = 0; + }; + + public: + + /*! Constructor. */ + BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds); + + /*! refits the BVH */ + void refit(); + + private: + /* single-threaded subtree extraction based on BVH depth */ + void gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth = 0); + + /* single-threaded top-level refit */ + BBox3fa refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth = 0); + + /* single-threaded subtree refit */ + BBox3fa recurse_bottom(NodeRef& ref); + + public: + BVH* bvh; //!< BVH to refit + const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves + + static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4 : (N==8) ? 3 : 3; + static const size_t MAX_NUM_SUB_TREES = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH + size_t numSubTrees; + NodeRef subTrees[MAX_NUM_SUB_TREES]; + }; + + template<int N, typename Mesh, typename Primitive> + class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface + { + public: + + /*! Type shortcuts */ + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + public: + BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode); + + virtual void build(); + + virtual void clear(); + + virtual const BBox3fa leafBounds (NodeRef& ref) const + { + size_t num; char* prim = ref.leaf(num); + if (unlikely(ref == BVH::emptyNode)) return empty; + + BBox3fa bounds = empty; + for (size_t i=0; i<num; i++) + bounds.extend(((Primitive*)prim)[i].update(mesh)); + return bounds; + } + + private: + BVH* bvh; + std::unique_ptr<Builder> builder; + std::unique_ptr<BVHNRefitter<N>> refitter; + Mesh* mesh; + unsigned int topologyVersion; + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree/kernels/bvh/bvh_rotate.cpp new file mode 100644 index 0000000000..460bd60c62 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_rotate.cpp @@ -0,0 +1,127 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_rotate.h" + +namespace embree +{ + namespace isa + { + /*! Computes half surface area of box. */ + __forceinline float halfArea3f(const BBox<vfloat4>& box) { + const vfloat4 d = box.size(); + const vfloat4 a = d*shuffle<1,2,0,3>(d); + return a[0]+a[1]+a[2]; + } + + size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth) + { + /*! nothing to rotate if we reached a leaf node. */ + if (parentRef.isBarrier()) return 0; + if (parentRef.isLeaf()) return 0; + AABBNode* parent = parentRef.getAABBNode(); + + /*! rotate all children first */ + vint4 cdepth; + for (size_t c=0; c<4; c++) + cdepth[c] = (int)rotate(parent->child(c),depth+1); + + /* compute current areas of all children */ + vfloat4 sizeX = parent->upper_x-parent->lower_x; + vfloat4 sizeY = parent->upper_y-parent->lower_y; + vfloat4 sizeZ = parent->upper_z-parent->lower_z; + vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ); + + /*! get node bounds */ + BBox<vfloat4> child1_0,child1_1,child1_2,child1_3; + parent->bounds(child1_0,child1_1,child1_2,child1_3); + + /*! Find best rotation. We pick a first child (child1) and a sub-child + (child2child) of a different second child (child2), and swap child1 + and child2child. We perform the best such swap. */ + float bestArea = 0; + size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; + for (size_t c2=0; c2<4; c2++) + { + /*! ignore leaf nodes as we cannot descent into them */ + if (parent->child(c2).isBarrier()) continue; + if (parent->child(c2).isLeaf()) continue; + AABBNode* child2 = parent->child(c2).getAABBNode(); + + /*! transpose child bounds */ + BBox<vfloat4> child2c0,child2c1,child2c2,child2c3; + child2->bounds(child2c0,child2c1,child2c2,child2c3); + + /*! put child1_0 at each child2 position */ + float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3)); + float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3)); + float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3)); + float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0)); + vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03); + vfloat4 min0 = vreduce_min(cost0); + int pos0 = (int)bsf(movemask(min0 == cost0)); + + /*! put child1_1 at each child2 position */ + float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3)); + float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3)); + float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3)); + float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1)); + vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13); + vfloat4 min1 = vreduce_min(cost1); + int pos1 = (int)bsf(movemask(min1 == cost1)); + + /*! put child1_2 at each child2 position */ + float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3)); + float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3)); + float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3)); + float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2)); + vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23); + vfloat4 min2 = vreduce_min(cost2); + int pos2 = (int)bsf(movemask(min2 == cost2)); + + /*! put child1_3 at each child2 position */ + float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3)); + float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3)); + float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3)); + float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3)); + vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33); + vfloat4 min3 = vreduce_min(cost3); + int pos3 = (int)bsf(movemask(min3 == cost3)); + + /*! find best other child */ + vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]); + int pos[4] = { pos0,pos1,pos2,pos3 }; + const size_t mbd = BVH4::maxBuildDepth; + vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints + valid &= vint4(int(c2)) != vint4(step); + if (none(valid)) continue; + size_t c1 = select_min(valid,area0123); + float area = area0123[c1]; + if (c1 == c2) continue; // can happen if bounds are NANs + + /*! accept a swap when it reduces cost and is not swapping a node with itself */ + if (area < bestArea) { + bestArea = area; + bestChild1 = c1; + bestChild2 = c2; + bestChild2Child = pos[c1]; + } + } + + /*! if we did not find a swap that improves the SAH then do nothing */ + if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth); + + /*! perform the best found tree rotation */ + AABBNode* child2 = parent->child(bestChild2).getAABBNode(); + AABBNode::swap(parent,bestChild1,child2,bestChild2Child); + parent->setBounds(bestChild2,child2->bounds()); + AABBNode::compact(parent); + AABBNode::compact(child2); + + /*! This returned depth is conservative as the child that was + * pulled up in the tree could have been on the critical path. */ + cdepth[bestChild1]++; // bestChild1 was pushed down one level + return 1+reduce_max(cdepth); + } + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_rotate.h b/thirdparty/embree/kernels/bvh/bvh_rotate.h new file mode 100644 index 0000000000..61ef64a679 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_rotate.h @@ -0,0 +1,37 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNRotate + { + typedef typename BVHN<N>::NodeRef NodeRef; + + public: + static const bool enabled = false; + + static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; } + static __forceinline void restructure(NodeRef ref, size_t depth = 1) {} + }; + + /* BVH4 tree rotations */ + template<> + class BVHNRotate<4> + { + typedef BVH4::AABBNode AABBNode; + typedef BVH4::NodeRef NodeRef; + + public: + static const bool enabled = true; + + static size_t rotate(NodeRef parentRef, size_t depth = 1); + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp new file mode 100644 index 0000000000..d857ff7d95 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp @@ -0,0 +1,168 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_statistics.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + template<int N> + BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh) + { + double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea()); + stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f)); + } + + template<int N> + std::string BVHNStatistics<N>::str() + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << " primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl; + size_t totalBytes = stat.bytes(bvh); + double totalSAH = stat.sah(bvh); + stream << " total : sah = " << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), "; + stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl; + if (stat.statAABBNodes.numNodes ) stream << " getAABBNodes : " << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodes.numNodes ) stream << " ungetAABBNodes : " << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB.numNodes ) stream << " getAABBNodesMB : " << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB4D.numNodes) stream << " getAABBNodesMB4D : " << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodesMB.numNodes) stream << " ungetAABBNodesMB : " << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statQuantizedNodes.numNodes ) stream << " quantizedNodes : " << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " leaves : " << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " histogram : " << stat.statLeaf.histToString() << std::endl; + return stream.str(); + } + + template<int N> + typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1) + { + Statistics s; + assert(t0t1.size() > 0.0f); + double dt = max(0.0f,t0t1.size()); + if (node.isAABBNode()) + { + AABBNode* n = node.getAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extend(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodes.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodes.numNodes++; + s.statAABBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNode()) + { + OBBNode* n = node.ungetAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodes.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodes.numNodes++; + s.statOBBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB()) + { + AABBNodeMB* n = node.getAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1)); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB.numNodes++; + s.statAABBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB4D()) + { + AABBNodeMB4D* n = node.getAABBNodeMB4D(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); + assert(!t0t1i.empty()); + const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i); + Statistics s = statistics(n->child(i),Ai,t0t1i); + s.statAABBNodesMB4D.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB4D.numNodes++; + s.statAABBNodesMB4D.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNodeMB()) + { + OBBNodeMB* n = node.ungetAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent0(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodesMB.numNodes++; + s.statOBBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isQuantizedNode()) + { + QuantizedNode* n = node.quantizedNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statQuantizedNodes.numChildren++; + return s; + }, Statistics::add); + s.statQuantizedNodes.numNodes++; + s.statQuantizedNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isLeaf()) + { + size_t num; const char* tri = node.leaf(num); + if (num) + { + for (size_t i=0; i<num; i++) + { + const size_t bytes = bvh->primTy->getBytes(tri); + s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri); + s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri); + s.statLeaf.numBytes += bytes; + tri+=bytes; + } + s.statLeaf.numLeaves++; + s.statLeaf.numPrimBlocks += num; + s.statLeaf.leafSAH += dt*A*num; + if (num-1 < Statistics::LeafStat::NHIST) { + s.statLeaf.numPrimBlocksHistogram[num-1]++; + } + } + } + else { + // -- GODOT start -- + // throw std::runtime_error("not supported node type in bvh_statistics"); + abort(); + // -- GODOT end -- + } + return s; + } + +#if defined(__AVX__) + template class BVHNStatistics<8>; +#endif + +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) + template class BVHNStatistics<4>; +#endif +} diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.h b/thirdparty/embree/kernels/bvh/bvh_statistics.h new file mode 100644 index 0000000000..a28e115f1c --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.h @@ -0,0 +1,285 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include <sstream> + +namespace embree +{ + template<int N> + class BVHNStatistics + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::OBBNode OBBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + typedef typename BVH::OBBNodeMB OBBNodeMB; + typedef typename BVH::QuantizedNode QuantizedNode; + + typedef typename BVH::NodeRef NodeRef; + + struct Statistics + { + template<typename Node> + struct NodeStat + { + NodeStat ( double nodeSAH = 0, + size_t numNodes = 0, + size_t numChildren = 0) + : nodeSAH(nodeSAH), + numNodes(numNodes), + numChildren(numChildren) {} + + double sah(BVH* bvh) const { + return nodeSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes() const { + return numNodes*sizeof(Node); + } + + size_t size() const { + return numNodes; + } + + double fillRateNom () const { return double(numChildren); } + double fillRateDen () const { return double(numNodes*N); } + double fillRate () const { return fillRateNom()/fillRateDen(); } + + __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b) + { + return NodeStat(a.nodeSAH + b.nodeSAH, + a.numNodes+b.numNodes, + a.numChildren+b.numChildren); + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives); + return stream.str(); + } + + public: + double nodeSAH; + size_t numNodes; + size_t numChildren; + }; + + struct LeafStat + { + static const int NHIST = 8; + + LeafStat ( double leafSAH = 0.0f, + size_t numLeaves = 0, + size_t numPrimsActive = 0, + size_t numPrimsTotal = 0, + size_t numPrimBlocks = 0, + size_t numBytes = 0) + : leafSAH(leafSAH), + numLeaves(numLeaves), + numPrimsActive(numPrimsActive), + numPrimsTotal(numPrimsTotal), + numPrimBlocks(numPrimBlocks), + numBytes(numBytes) + { + for (size_t i=0; i<NHIST; i++) + numPrimBlocksHistogram[i] = 0; + } + + double sah(BVH* bvh) const { + return leafSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes(BVH* bvh) const { + return numBytes; + } + + size_t size() const { + return numLeaves; + } + + double fillRateNom (BVH* bvh) const { return double(numPrimsActive); } + double fillRateDen (BVH* bvh) const { return double(numPrimsTotal); } + double fillRate (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); } + + __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b) + { + LeafStat stat(a.leafSAH + b.leafSAH, + a.numLeaves+b.numLeaves, + a.numPrimsActive+b.numPrimsActive, + a.numPrimsTotal+b.numPrimsTotal, + a.numPrimBlocks+b.numPrimBlocks, + a.numBytes+b.numBytes); + for (size_t i=0; i<NHIST; i++) { + stat.numPrimBlocksHistogram[i] += a.numPrimBlocksHistogram[i]; + stat.numPrimBlocksHistogram[i] += b.numPrimBlocksHistogram[i]; + } + return stat; + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives); + return stream.str(); + } + + std::string histToString() const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + for (size_t i=0; i<NHIST; i++) + stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves) << "% "; + return stream.str(); + } + + public: + double leafSAH; //!< SAH of the leaves only + size_t numLeaves; //!< Number of leaf nodes. + size_t numPrimsActive; //!< Number of active primitives ( + size_t numPrimsTotal; //!< Number of active and inactive primitives + size_t numPrimBlocks; //!< Number of primitive blocks. + size_t numBytes; //!< Number of bytes of leaves. + size_t numPrimBlocksHistogram[8]; + }; + + public: + Statistics (size_t depth = 0, + LeafStat statLeaf = LeafStat(), + NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(), + NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(), + NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(), + NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(), + NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(), + NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>()) + + : depth(depth), + statLeaf(statLeaf), + statAABBNodes(statAABBNodes), + statOBBNodes(statOBBNodes), + statAABBNodesMB(statAABBNodesMB), + statAABBNodesMB4D(statAABBNodesMB4D), + statOBBNodesMB(statOBBNodesMB), + statQuantizedNodes(statQuantizedNodes) {} + + double sah(BVH* bvh) const + { + return statLeaf.sah(bvh) + + statAABBNodes.sah(bvh) + + statOBBNodes.sah(bvh) + + statAABBNodesMB.sah(bvh) + + statAABBNodesMB4D.sah(bvh) + + statOBBNodesMB.sah(bvh) + + statQuantizedNodes.sah(bvh); + } + + size_t bytes(BVH* bvh) const { + return statLeaf.bytes(bvh) + + statAABBNodes.bytes() + + statOBBNodes.bytes() + + statAABBNodesMB.bytes() + + statAABBNodesMB4D.bytes() + + statOBBNodesMB.bytes() + + statQuantizedNodes.bytes(); + } + + size_t size() const + { + return statLeaf.size() + + statAABBNodes.size() + + statOBBNodes.size() + + statAABBNodesMB.size() + + statAABBNodesMB4D.size() + + statOBBNodesMB.size() + + statQuantizedNodes.size(); + } + + double fillRate (BVH* bvh) const + { + double nom = statLeaf.fillRateNom(bvh) + + statAABBNodes.fillRateNom() + + statOBBNodes.fillRateNom() + + statAABBNodesMB.fillRateNom() + + statAABBNodesMB4D.fillRateNom() + + statOBBNodesMB.fillRateNom() + + statQuantizedNodes.fillRateNom(); + double den = statLeaf.fillRateDen(bvh) + + statAABBNodes.fillRateDen() + + statOBBNodes.fillRateDen() + + statAABBNodesMB.fillRateDen() + + statAABBNodesMB4D.fillRateDen() + + statOBBNodesMB.fillRateDen() + + statQuantizedNodes.fillRateDen(); + return nom/den; + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b ) + { + return Statistics(max(a.depth,b.depth), + a.statLeaf + b.statLeaf, + a.statAABBNodes + b.statAABBNodes, + a.statOBBNodes + b.statOBBNodes, + a.statAABBNodesMB + b.statAABBNodesMB, + a.statAABBNodesMB4D + b.statAABBNodesMB4D, + a.statOBBNodesMB + b.statOBBNodesMB, + a.statQuantizedNodes + b.statQuantizedNodes); + } + + static Statistics add ( const Statistics& a, const Statistics& b ) { + return a+b; + } + + public: + size_t depth; + LeafStat statLeaf; + NodeStat<AABBNode> statAABBNodes; + NodeStat<OBBNode> statOBBNodes; + NodeStat<AABBNodeMB> statAABBNodesMB; + NodeStat<AABBNodeMB4D> statAABBNodesMB4D; + NodeStat<OBBNodeMB> statOBBNodesMB; + NodeStat<QuantizedNode> statQuantizedNodes; + }; + + public: + + /* Constructor gathers statistics. */ + BVHNStatistics (BVH* bvh); + + /*! Convert statistics into a string */ + std::string str(); + + double sah() const { + return stat.sah(bvh); + } + + size_t bytesUsed() const { + return stat.bytes(bvh); + } + + private: + Statistics statistics(NodeRef node, const double A, const BBox1f dt); + + private: + BVH* bvh; + Statistics stat; + }; + + typedef BVHNStatistics<4> BVH4Statistics; + typedef BVHNStatistics<8> BVH8Statistics; +} diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser1.h b/thirdparty/embree/kernels/bvh/bvh_traverser1.h new file mode 100644 index 0000000000..8ce01b57f5 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_traverser1.h @@ -0,0 +1,466 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "node_intersector1.h" +#include "../common/stack_item.h" + +#define NEW_SORTING_CODE 1 + +namespace embree +{ + namespace isa + { + /*! BVH regular node traversal for single rays. */ + template<int N, int types> + class BVHNNodeTraverser1Hit; + +#if defined(__AVX512VL__) // SKX + + template<int N> + __forceinline void isort_update(vint<N> &dist, const vint<N> &d) + { + const vint<N> dist_shift = align_shift_right<N-1>(dist,dist); + const vboolf<N> m_geq = d >= dist; + const vboolf<N> m_geq_shift = m_geq << 1; + dist = select(m_geq,d,dist); + dist = select(m_geq_shift,dist_shift,dist); + } + + template<int N> + __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) { + dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero))); + } + + __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { + return toScalar(permutex2var((__m256i)index,n0,n1)); + } + + __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { + return toScalar(permute(n,index)); + } + +#endif + + /* Specialization for BVH4. */ + template<int types> + class BVHNNodeTraverser1Hit<4, types> + { + typedef BVH4 BVH; + typedef BVH4::NodeRef NodeRef; + typedef BVH4::BaseNode BaseNode; + + + public: + /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */ + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat4& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } + +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + StackItemT<NodeRef>::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif + } + + /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */ + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat4& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + + /* Specialization for BVH8. */ + template<int types> + class BVHNNodeTraverser1Hit<8, types> + { + typedef BVH8 BVH; + typedef BVH8::NodeRef NodeRef; + typedef BVH8::BaseNode BaseNode; + +#if defined(__AVX512VL__) + template<class NodeRef, class BaseNode> + static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]); + const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]); + vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); + distance_i = vint8::compact((int)mask,distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + + mask &= mask-1; + if (likely(mask == 0)) return; + + /* 2 hits: order A0 B0 */ + const vint8 d0(distance_i); + const vint8 d1(shuffle<1>(distance_i)); + cur = permuteExtract(d1,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A0 = min(d0, d1); + const vint8 dist_B0 = max(d0, d1); + assert(dist_A0[0] < dist_B0[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A0,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); + stackPtr++; + return; + } + + /* 3 hits: order A1 B1 C1 */ + + const vint8 d2(shuffle<2>(distance_i)); + cur = permuteExtract(d2,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A1 = min(dist_A0,d2); + const vint8 dist_tmp_B1 = max(dist_A0,d2); + const vint8 dist_B1 = min(dist_B0,dist_tmp_B1); + const vint8 dist_C1 = max(dist_B0,dist_tmp_B1); + assert(dist_A1[0] < dist_B1[0]); + assert(dist_B1[0] < dist_C1[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A1,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear); + stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear); + stackPtr+=2; + return; + } + + /* 4 hits: order A2 B2 C2 D2 */ + + const vint8 d3(shuffle<3>(distance_i)); + cur = permuteExtract(d3,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A2 = min(dist_A1,d3); + const vint8 dist_tmp_B2 = max(dist_A1,d3); + const vint8 dist_B2 = min(dist_B1,dist_tmp_B2); + const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2); + const vint8 dist_C2 = min(dist_C1,dist_tmp_C2); + const vint8 dist_D2 = max(dist_C1,dist_tmp_C2); + assert(dist_A2[0] < dist_B2[0]); + assert(dist_B2[0] < dist_C2[0]); + assert(dist_C2[0] < dist_D2[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A2,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear); + stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear); + stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1); + *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear); + stackPtr+=3; + return; + } + + /* >=5 hits: reverse to descending order for writing to stack */ + + distance_i = align_shift_right<3>(distance_i,distance_i); + const size_t hits = 4 + popcnt(mask); + vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert + + isort_quick_update<8>(dist,dist_A2); + isort_quick_update<8>(dist,dist_B2); + isort_quick_update<8>(dist,dist_C2); + isort_quick_update<8>(dist,dist_D2); + + do { + + distance_i = align_shift_right<1>(distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + const vint8 new_dist(permute(distance_i,vint8(zero))); + mask &= mask-1; + isort_update<8>(dist,new_dist); + + } while(mask); + + for (size_t i=0; i<7; i++) + assert(dist[i+0]>=dist[i+1]); + + for (size_t i=0;i<hits-1;i++) + { + stackPtr->ptr = permuteExtract(dist,n0,n1); + *(float*)&stackPtr->dist = permuteExtract(dist,tNear); + dist = align_shift_right<1>(dist,dist); + stackPtr++; + } + cur = permuteExtract(dist,n0,n1); + } +#endif + + public: + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); +#if defined(__AVX512VL__) + traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#else + + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; + return; + } + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; + /*! fallback case if more than 4 children are hit */ + StackItemT<NodeRef>* stackFirst = stackPtr; + stackPtr+=4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; + const vint4 s((size_t)c,(size_t)d); + *(vint4*)stackPtr++ = s; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + /*! fallback case if more than 4 children are hit */ + StackItemT<NodeRef>* stackFirst = stackPtr-4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif +#endif + } + + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h new file mode 100644 index 0000000000..852981e69d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_traverser_stream.h @@ -0,0 +1,149 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" + +namespace embree +{ + namespace isa + { + template<int N, int types> + class BVHNNodeTraverserStreamHitCoherent + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + + public: + template<class T> + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t& m_trav_active, + const vbool<N>& vmask, + const vfloat<N>& tNear, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + const size_t r0 = bscf(mask); + assert(r0 < 8); + cur = node->child(r0); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r0]; + assert(cur != BVH::emptyNode); + if (unlikely(mask == 0)) return; + + const unsigned int* const tNear_i = (unsigned int*)&tNear; + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + unsigned int d0 = tNear_i[r0]; + const size_t r1 = bscf(mask); + assert(r1 < 8); + NodeRef c1 = node->child(r1); + BVHN<N>::prefetch(c1,types); + unsigned int d1 = tNear_i[r1]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + if (d0 < d1) { + assert(tNear[r1] >= 0.0f); + stackPtr->mask = tMask[r1]; + stackPtr->parent = parent; + stackPtr->child = c1; + stackPtr++; + cur = c0; + m_trav_active = tMask[r0]; + return; + } + else { + assert(tNear[r0] >= 0.0f); + stackPtr->mask = tMask[r0]; + stackPtr->parent = parent; + stackPtr->child = c0; + stackPtr++; + cur = c1; + m_trav_active = tMask[r1]; + return; + } + } + + /*! slow path for more than two hits */ + size_t hits = movemask(vmask); + const vint<N> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<N>(step), 0); + const vint<N> dist_i_sorted = usort_descending(dist_i); + const vint<N> sorted_index = dist_i_sorted & 7; + + size_t i = 0; + for (;;) + { + const unsigned int index = sorted_index[i]; + assert(index < 8); + cur = node->child(index); + m_trav_active = tMask[index]; + assert(m_trav_active); + BVHN<N>::prefetch(cur,types); + bscf(hits); + if (unlikely(hits==0)) break; + i++; + assert(cur != BVH::emptyNode); + assert(tNear[index] >= 0.0f); + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + + template<class T> + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t& m_trav_active, + const vbool<N>& vmask, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector.h b/thirdparty/embree/kernels/bvh/node_intersector.h new file mode 100644 index 0000000000..25edaf295d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector.h @@ -0,0 +1,31 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + struct NearFarPrecalculations + { + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + + __forceinline NearFarPrecalculations() {} + + __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N) + { + const size_t size = sizeof(float)*N; + nearX = (dir.x < 0.0f) ? 1*size : 0*size; + nearY = (dir.y < 0.0f) ? 3*size : 2*size; + nearZ = (dir.z < 0.0f) ? 5*size : 4*size; + farX = nearX ^ size; + farY = nearY ^ size; + farZ = nearZ ^ size; + } + }; + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h new file mode 100644 index 0000000000..1ec4fc63fc --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector1.h @@ -0,0 +1,1403 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + struct TravRayBase; + + /* Base (without tnear and tfar) */ + template<int N> + struct TravRayBase<N,false> + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const Vec3fa ray_rdir = rcp_safe(ray_dir); + org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); + rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z); +#if defined(__AVX2__) || defined(__ARM_NEON) + const Vec3fa ray_org_rdir = ray_org*ray_rdir; + org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#endif + nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); + nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); + nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); + farX = nearX ^ sizeof(vfloat<N>); + farY = nearY ^ sizeof(vfloat<N>); + farZ = nearZ ^ sizeof(vfloat<N>); + } + + template<int K> + __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + size_t flip = sizeof(vfloat<N>)) + { + org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); +#if defined(__AVX2__) || defined(__ARM_NEON) + org_rdir = org*rdir; +#endif + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf<N> org, dir, rdir; +#if defined(__AVX2__) || defined(__ARM_NEON) + Vec3vf<N> org_rdir; +#endif + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Base (without tnear and tfar) */ + template<int N> + struct TravRayBase<N,true> + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const float round_down = 1.0f-3.0f*float(ulp); + const float round_up = 1.0f+3.0f*float(ulp); + const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir); + const Vec3fa ray_rdir_near = round_down*ray_rdir; + const Vec3fa ray_rdir_far = round_up *ray_rdir; + org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); + rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); + rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); + + nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); + nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); + nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); + farX = nearX ^ sizeof(vfloat<N>); + farY = nearY ^ sizeof(vfloat<N>); + farZ = nearZ ^ sizeof(vfloat<N>); + } + + template<int K> + __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + size_t flip = sizeof(vfloat<N>)) + { + const vfloat<N> round_down = 1.0f-3.0f*float(ulp); + const vfloat<N> round_up = 1.0f+3.0f*float(ulp); + org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir_near = round_down*Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + rdir_far = round_up *Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf<N> org, dir, rdir_near, rdir_far; + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Full (with tnear and tfar) */ + template<int N, bool robust> + struct TravRay : TravRayBase<N,robust> + { + __forceinline TravRay() {} + + __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar) + : TravRayBase<N,robust>(ray_org, ray_dir), + tnear(ray_tnear), tfar(ray_tfar) {} + + template<int K> + __forceinline void init(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + float ray_tnear, float ray_tfar, + size_t flip = sizeof(vfloat<N>)) + { + TravRayBase<N,robust>::template init<K>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip); + tnear = ray_tnear; tfar = ray_tfar; + } + + vfloat<N> tnear; + vfloat<N> tfar; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // Point Query structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + struct TravPointQuery + { + __forceinline TravPointQuery() {} + + __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad) + { + org = Vec3vf<N>(query_org.x, query_org.y, query_org.z); + rad = Vec3vf<N>(query_rad.x, query_rad.y, query_rad.z); + } + + __forceinline vfloat<N> const& tfar() const { + return rad.x; + } + + Vec3vf<N> org, rad; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // point query + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t pointQuerySphereDistAndMask( + const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, + vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ) + { + const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool<N> vmask = dist <= query.tfar()*query.tfar(); + const vbool<N> valid = minX <= maxX; + return movemask(vmask) & movemask(valid); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); + const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y)); + const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z)); + const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x)); + const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y)); + const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z)); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); + const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y); + const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z); + const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x); + const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y); + const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z); + const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); + const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0])); + const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0])); + const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0])); + const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0])); + const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0])); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeSphere(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> start_x(node->start.x); + const vfloat<N> scale_x(node->scale.x); + const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> start_y(node->start.y); + const vfloat<N> scale_y(node->scale.y); + const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> start_z(node->start.z); + const vfloat<N> scale_z(node->scale.z); + const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N> minX = node->dequantizeLowerX(time); + const vfloat<N> maxX = node->dequantizeUpperX(time); + const vfloat<N> minY = node->dequantizeLowerY(time); + const vfloat<N> maxY = node->dequantizeUpperY(time); + const vfloat<N> minZ = node->dequantizeLowerZ(time); + const vfloat<N> maxZ = node->dequantizeUpperZ(time); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryAABBDistAndMask( + const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, + vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ) + { + const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool<N> valid = minX <= maxX; + const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | + (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) | + (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z)); + return movemask(vmask) & movemask(valid); + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); + const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y)); + const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z)); + const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x)); + const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y)); + const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z)); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); + const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y); + const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z); + const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x); + const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y); + const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z); + const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); + const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0])); + const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0])); + const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0])); + const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0])); + const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0])); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeAABB(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat<N> start_x(node->start.x); + const vfloat<N> scale_x(node->scale.x); + const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> start_y(node->start.y); + const vfloat<N> scale_y(node->scale.y); + const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> start_z(node->start.z); + const vfloat<N> scale_z(node->scale.z); + const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat<N> minX = node->dequantizeLowerX(time); + const vfloat<N> maxX = node->dequantizeUpperX(time); + const vfloat<N> minY = node->dequantizeLowerY(time); + const vfloat<N> maxY = node->dequantizeUpperY(time); + const vfloat<N> minZ = node->dequantizeLowerZ(time); + const vfloat<N> maxZ = node->dequantizeUpperZ(time); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist); + + template<> + __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist) + { +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__aarch64__) + const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); + const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist) + { +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,true>& ray, vfloat<N>& dist) + { + const vfloat<N> tNearX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tFarX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<N)-1); +#elif defined(__AVX512F__) // SKX + const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + const vbool<N> vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); + const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + const size_t mask = movemask(tNear <= tFar); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); +#if defined (__AVX2__) || defined(__ARM_NEON) + const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__AVX2__) && !defined(__AVX512F__) + const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); + const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); +#else + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); +#endif + vbool<N> vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); + const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + vbool<N> vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist); + + template<> + __forceinline size_t intersectNode<4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,false>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,true>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask & mvalid; + } + + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,false>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,true>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + + dist = tNear; + return mask & mvalid; + } + + +#endif + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + const vboolf<N> mvalid = node->validMask(); + const vfloat<N> lower_x = node->dequantizeLowerX(time); + const vfloat<N> upper_x = node->dequantizeUpperX(time); + const vfloat<N> lower_y = node->dequantizeLowerY(time); + const vfloat<N> upper_y = node->dequantizeUpperY(time); + const vfloat<N> lower_z = node->dequantizeLowerZ(time); + const vfloat<N> upper_z = node->dequantizeUpperZ(time); +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat<N> tminX = mini(tNearX,tFarX); + const vfloat<N> tmaxX = maxi(tNearX,tFarX); + const vfloat<N> tminY = mini(tNearY,tFarY); + const vfloat<N> tmaxY = maxi(tNearY,tFarY); + const vfloat<N> tminZ = mini(tNearZ,tFarZ); + const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ); + const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) // SKX + const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + const vboolf<N> mvalid = node->validMask(); + const vfloat<N> lower_x = node->dequantizeLowerX(time); + const vfloat<N> upper_x = node->dequantizeUpperX(time); + const vfloat<N> lower_y = node->dequantizeLowerY(time); + const vfloat<N> upper_y = node->dequantizeUpperY(time); + const vfloat<N> lower_z = node->dequantizeLowerZ(time); + const vfloat<N> upper_z = node->dequantizeUpperZ(time); + const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat<N> tminX = mini(tNearX,tFarX); + const vfloat<N> tmaxX = maxi(tNearX,tFarX); + const vfloat<N> tminY = mini(tNearY,tFarY); + const vfloat<N> tmaxY = maxi(tNearY,tFarY); + const vfloat<N> tminZ = mini(tNearZ,tFarZ); + const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ); + const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) // SKX + const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,robust>& ray, vfloat<N>& dist) + { + const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir); + //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir; + const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir); + const Vec3vf<N> org = xfmPoint(node->naabb,ray.org); + const Vec3vf<N> tLowerXYZ = org * nrdir; // (Vec3fa(zero) - org) * rdir; + const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir; + + const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp)); + } + const vbool<N> vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,robust>& ray, const float time, vfloat<N>& dist) + { + const AffineSpace3vf<N> xfm = node->space0; + const Vec3vf<N> b0_lower = zero; + const Vec3vf<N> b0_upper = one; + const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time)); + const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time)); + + const BBox3vf<N> bounds(lower,upper); + const Vec3vf<N> dir = xfmVector(xfm,ray.dir); + const Vec3vf<N> rdir = rcp_safe(dir); + const Vec3vf<N> org = xfmPoint(xfm,ray.org); + + const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir; + const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir; + + const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp)); + } + const vbool<N> vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in point query raversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Computes traversal information for N nodes with 1 point query */ + template<int N, int types> + struct BVHNNodePointQuerySphere1; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeSphereMB4D(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_QN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNQuantizedBaseNodePointQuerySphere1 + { + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + return pointQueryNodeSphere(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + return pointQueryNodeSphere(node,query,time,dist); + } + }; + + /*! Computes traversal information for N nodes with 1 point query */ + template<int N, int types> + struct BVHNNodePointQueryAABB1; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeAABBMB4D(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_QN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNQuantizedBaseNodePointQueryAABB1 + { + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + return pointQueryNodeAABB(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + return pointQueryNodeAABB(node,query,time,dist); + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with 1 ray */ + template<int N, int types, bool robust> + struct BVHNNodeIntersector1; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNode(), ray, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4D<N>(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4DRobust<N>(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1_UN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNode(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN1_UN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4D(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_AN2_AN4D_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4DRobust(node, ray, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_QN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,false>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + template<int N> + struct BVHNNodeIntersector1<N, BVH_QN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,true>& ray, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + /*! Intersects N nodes with K rays */ + template<int N, bool robust> + struct BVHNQuantizedBaseNodeIntersector1; + + template<int N> + struct BVHNQuantizedBaseNodeIntersector1<N, false> + { + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,false>& ray, vfloat<N>& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,false>& ray, const float time, vfloat<N>& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + template<int N> + struct BVHNQuantizedBaseNodeIntersector1<N, true> + { + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,true>& ray, vfloat<N>& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,true>& ray, const float time, vfloat<N>& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h new file mode 100644 index 0000000000..1f7215e5df --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h @@ -0,0 +1,241 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Frustum structure used in hybrid and stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /* + Optimized frustum test. We calculate t=(p-org)/dir in ray/box + intersection. We assume the rays are split by octant, thus + dir intervals are either positive or negative in each + dimension. + + Case 1: dir.min >= 0 && dir.max >= 0: + t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + + Case 2: dir.min < 0 && dir.max < 0: + t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + */ + + template<bool robust> + struct Frustum; + + /* Fast variant */ + template<> + struct Frustum<false> + { + __forceinline Frustum() {} + + template<int K> + __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + + min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); + max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); + + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template<int K> + __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + + Vec3fa min_org_rdir; + Vec3fa max_org_rdir; + + float min_dist; + float max_dist; + }; + + typedef Frustum<false> FrustumFast; + + /* Robust variant */ + template<> + struct Frustum<true> + { + __forceinline Frustum() {} + + template<int K> + __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + + min_org = select(pos_rdir, reduced_max_org, reduced_min_org); + max_org = select(pos_rdir, reduced_min_org, reduced_max_org); + + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template<int K> + __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + + Vec3fa min_org; + Vec3fa max_org; + + float min_dist; + float max_dist; + }; + + typedef Frustum<true> FrustumRobust; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node, + const FrustumFast& frustum, vfloat<N>& dist) + { + const vfloat<N> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat<N> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat<N> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat<N> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); + + const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x)); + const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y)); + const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z)); + const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x)); + const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y)); + const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z)); + + const vfloat<N> fmin = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist)); + dist = fmin; + const vfloat<N> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist)); + const vbool<N> vmask_node_hit = fmin <= fmax; + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node, + const FrustumRobust& frustum, vfloat<N>& dist) + { + const vfloat<N> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat<N> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat<N> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat<N> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); + + const vfloat<N> fminX = (bminX - vfloat<N>(frustum.min_org.x)) * vfloat<N>(frustum.min_rdir.x); + const vfloat<N> fminY = (bminY - vfloat<N>(frustum.min_org.y)) * vfloat<N>(frustum.min_rdir.y); + const vfloat<N> fminZ = (bminZ - vfloat<N>(frustum.min_org.z)) * vfloat<N>(frustum.min_rdir.z); + const vfloat<N> fmaxX = (bmaxX - vfloat<N>(frustum.max_org.x)) * vfloat<N>(frustum.max_rdir.x); + const vfloat<N> fmaxY = (bmaxY - vfloat<N>(frustum.max_org.y)) * vfloat<N>(frustum.max_rdir.y); + const vfloat<N> fmaxZ = (bmaxZ - vfloat<N>(frustum.max_org.z)) * vfloat<N>(frustum.max_rdir.z); + + const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const float round_up = 1.0f+2.0f*float(ulp); + const vfloat<N> fmin = max(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist)); + dist = fmin; + const vfloat<N> fmax = min(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist)); + const vbool<N> vmask_node_hit = (round_down*fmin <= round_up*fmax); + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h new file mode 100644 index 0000000000..d5498fc5db --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h @@ -0,0 +1,805 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int K, bool robust> + struct TravRayK; + + /* Fast variant */ + template<int K> + struct TravRayK<K, false> + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = rcp_safe(ray_dir); +#if defined(__AVX2__) || defined(__ARM_NEON) + org_rdir = org * rdir; +#endif + + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); + } + } + + Vec3vf<K> org; + Vec3vf<K> dir; + Vec3vf<K> rdir; +#if defined(__AVX2__) || defined(__ARM_NEON) + Vec3vf<K> org_rdir; +#endif + Vec3vi<K> nearXYZ; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKFast = TravRayK<K, false>; + + /* Robust variant */ + template<int K> + struct TravRayK<K, true> + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); + + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); + } + } + + Vec3vf<K> org; + Vec3vf<K> dir; + Vec3vf<K> rdir; + Vec3vi<K> nearXYZ; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKRobust = TravRayK<K, true>; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i, + const TravRayKFast<K>& ray, vfloat<K>& dist) + + { + #if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i, + const TravRayKRobust<K>& ray, vfloat<K>& dist) + { + // FIXME: use per instruction rounding for AVX512 + const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX512F__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); +#if defined(__AVX512F__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); +#else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); +#endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + +#if defined(__AVX512F__) // SKX + if (K == 16) + { + const vfloat<K> lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat<K> lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat<K> lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K, bool robust> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i, + const TravRayK<K,robust>& ray, vfloat<K>& dist) + { + const AffineSpace3vf<K> naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]), + Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]), + Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]), + Vec3f(node->naabb.p .x[i], node->naabb.p .y[i], node->naabb.p .z[i])); + + const Vec3vf<K> dir = xfmVector(naabb, ray.dir); + const Vec3vf<K> nrdir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1? + const Vec3vf<K> org = xfmPoint(naabb, ray.org); + + const vfloat<K> lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir; + const vfloat<K> lclipMinY = org.y * nrdir.y; + const vfloat<K> lclipMinZ = org.z * nrdir.z; + const vfloat<K> lclipMaxX = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir; + const vfloat<K> lclipMaxY = lclipMinY - nrdir.y; + const vfloat<K> lclipMaxZ = lclipMinZ - nrdir.z; + + vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp)); + } + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K, bool robust> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i, + const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const AffineSpace3vf<K> xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]), + Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]), + Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]), + Vec3f(node->space0.p .x[i], node->space0.p .y[i], node->space0.p .z[i])); + + const Vec3vf<K> b0_lower = zero; + const Vec3vf<K> b0_upper = one; + const Vec3vf<K> b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]); + const Vec3vf<K> b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]); + const Vec3vf<K> lower = lerp(b0_lower, b1_lower, time); + const Vec3vf<K> upper = lerp(b0_upper, b1_upper, time); + + const Vec3vf<K> dir = xfmVector(xfm, ray.dir); + const Vec3vf<K> rdir = rcp_safe(dir); + const Vec3vf<K> org = xfmPoint(xfm, ray.org); + + const vfloat<K> lclipMinX = (lower.x - org.x) * rdir.x; + const vfloat<K> lclipMinY = (lower.y - org.y) * rdir.y; + const vfloat<K> lclipMinZ = (lower.z - org.z) * rdir.z; + const vfloat<K> lclipMaxX = (upper.x - org.x) * rdir.x; + const vfloat<K> lclipMaxY = (upper.y - org.y) * rdir.y; + const vfloat<K> lclipMaxZ = (upper.z - org.z) * rdir.z; + + vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp)); + } + + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + + ////////////////////////////////////////////////////////////////////////////////////// + // QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i, + const TravRayK<K,false>& ray, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat<N> lower_x = node->dequantizeLowerX(); + const vfloat<N> upper_x = node->dequantizeUpperX(); + const vfloat<N> lower_y = node->dequantizeLowerY(); + const vfloat<N> upper_y = node->dequantizeUpperY(); + const vfloat<N> lower_z = node->dequantizeLowerZ(); + const vfloat<N> upper_z = node->dequantizeUpperZ(); + + #if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i, + const TravRayK<K,true>& ray, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat<N> lower_x = node->dequantizeLowerX(); + const vfloat<N> upper_x = node->dequantizeUpperX(); + const vfloat<N> lower_y = node->dequantizeLowerY(); + const vfloat<N> upper_y = node->dequantizeUpperY(); + const vfloat<N> lower_z = node->dequantizeLowerZ(); + const vfloat<N> upper_z = node->dequantizeUpperZ(); + + const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat<K> lower_x = node->template dequantizeLowerX<K>(i,time); + const vfloat<K> upper_x = node->template dequantizeUpperX<K>(i,time); + const vfloat<K> lower_y = node->template dequantizeLowerY<K>(i,time); + const vfloat<K> upper_y = node->template dequantizeUpperY<K>(i,time); + const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time); + const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time); + +#if defined(__AVX2__) || defined(__ARM_NEON) + const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + #endif + const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat<K> lower_x = node->template dequantizeLowerX<K>(i,time); + const vfloat<K> upper_x = node->template dequantizeUpperX<K>(i,time); + const vfloat<K> lower_y = node->template dequantizeLowerY<K>(i,time); + const vfloat<K> upper_y = node->template dequantizeUpperY<K>(i,time); + const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time); + const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time); + + const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with K rays */ + template<int N, int K, int types, bool robust> + struct BVHNNodeIntersectorK; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1, false> + { + /* vmask is both an input and an output parameter! Its initial value should be the parent node + hit mask, which is used for correctly computing the current hit mask. The parent hit mask + is actually required only for motion blur node intersections (because different rays may + have different times), so for regular nodes vmask is simply overwritten. */ + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + + /*! Intersects N nodes with K rays */ + template<int N, int K, bool robust> + struct BVHNQuantizedBaseNodeIntersectorK; + + template<int N, int K> + struct BVHNQuantizedBaseNodeIntersectorK<N, K, false> + { + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i, + const TravRayK<K,false>& ray, vfloat<K>& dist) + { + return intersectQuantizedNodeK<N,K>(node,i,ray,dist); + } + + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); + } + + }; + + template<int N, int K> + struct BVHNQuantizedBaseNodeIntersectorK<N, K, true> + { + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i, + const TravRayK<K,true>& ray, vfloat<K>& dist) + { + return intersectQuantizedNodeK<N,K>(node,i,ray,dist); + } + + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); + } + }; + + + } +} diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h new file mode 100644 index 0000000000..55b2c27231 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h @@ -0,0 +1,189 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int K, bool robust> + struct TravRayKStream; + + /* Fast variant */ + template<int K> + struct TravRayKStream<K, false> + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) + { + rdir = rcp_safe(ray_dir); + org_rdir = ray_org * rdir; + } + + Vec3vf<K> rdir; + Vec3vf<K> org_rdir; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKStreamFast = TravRayKStream<K, false>; + + /* Robust variant */ + template<int K> + struct TravRayKStream<K, true> + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) + { + rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); + org = ray_org; + } + + Vec3vf<K> rdir; + Vec3vf<K> org; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKStreamRobust = TravRayKStream<K, true>; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node, + const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + + const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); + const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); + const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); + const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); + const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); + const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); + const vfloat<N> rmin = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k])); + const vfloat<N> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k])); + + const vbool<N> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template<int N, int K> + __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf) + { + char* ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); + const vfloat<K> bminY = *(const float*)(ptr + nf.nearY); + const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX); + const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); + const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); + + const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); + + const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear); + const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); + + const vbool<K> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node, + const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat<N> bminX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<N> bminY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<N> bminZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<N> bmaxX = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + + const vfloat<N> rminX = (bminX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]); + const vfloat<N> rminY = (bminY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]); + const vfloat<N> rminZ = (bminZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]); + const vfloat<N> rmaxX = (bmaxX - vfloat<N>(ray.org.x[k])) * vfloat<N>(ray.rdir.x[k]); + const vfloat<N> rmaxY = (bmaxY - vfloat<N>(ray.org.y[k])) * vfloat<N>(ray.rdir.y[k]); + const vfloat<N> rmaxZ = (bmaxZ - vfloat<N>(ray.org.z[k])) * vfloat<N>(ray.rdir.z[k]); + const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const vfloat<N> rmin = max(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k])); + const vfloat<N> rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k])); + + const vbool<N> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template<int N, int K> + __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf) + { + char *ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); + const vfloat<K> bminY = *(const float*)(ptr + nf.nearY); + const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX); + const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); + const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); + + const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x; + const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y; + const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z; + const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x; + const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y; + const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const vfloat<K> rmin = max(rminX, rminY, rminZ, vfloat<K>(ray.tnear)); + const vfloat<K> rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar)); + + const vbool<K> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + } +} diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h new file mode 100644 index 0000000000..cc4ea1805b --- /dev/null +++ b/thirdparty/embree/kernels/common/accel.h @@ -0,0 +1,556 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "point_query.h" +#include "context.h" + +namespace embree +{ + class Scene; + + /*! Base class for the acceleration structure data. */ + class AccelData : public RefCount + { + ALIGNED_CLASS_(16); + public: + enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 }; + + public: + AccelData (const Type type) + : bounds(empty), type(type) {} + + /*! notifies the acceleration structure about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears the acceleration structure data */ + virtual void clear() = 0; + + /*! returns normal bounds */ + __forceinline BBox3fa getBounds() const { + return bounds.bounds(); + } + + /*! returns bounds for some time */ + __forceinline BBox3fa getBounds(float t) const { + return bounds.interpolate(t); + } + + /*! returns linear bounds */ + __forceinline LBBox3fa getLinearBounds() const { + return bounds; + } + + /*! checks if acceleration structure is empty */ + __forceinline bool isEmpty() const { + return bounds.bounds0.lower.x == float(pos_inf); + } + + public: + LBBox3fa bounds; // linear bounds + Type type; + }; + + /*! Base class for all intersectable and buildable acceleration structures. */ + class Accel : public AccelData + { + ALIGNED_CLASS_(16); + public: + + struct Intersectors; + + /*! Type of collide function */ + typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr); + + /*! Type of point query function */ + typedef bool(*PointQueryFunc)(Intersectors* This, /*!< this pointer to accel */ + PointQuery* query, /*!< point query for lookup */ + PointQueryContext* context); /*!< point query context */ + + /*! Type of intersect function pointer for single rays. */ + typedef void (*IntersectFunc)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHit& ray, /*!< ray to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 4. */ + typedef void (*IntersectFunc4)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit4& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 8. */ + typedef void (*IntersectFunc8)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit8& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 16. */ + typedef void (*IntersectFunc16)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit16& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHitN** ray, /*!< ray stream to intersect */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + + + /*! Type of occlusion function pointer for single rays. */ + typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */ + RTCRay& ray, /*!< ray to test occlusion */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 4. */ + typedef void (*OccludedFunc4) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay4& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 8. */ + typedef void (*OccludedFunc8) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay8& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 16. */ + typedef void (*OccludedFunc16) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay16& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayN** ray, /*!< ray stream to test occlusion */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + typedef void (*ErrorFunc) (); + + struct Collider + { + Collider (ErrorFunc error = nullptr) + : collide((CollideFunc)error), name(nullptr) {} + + Collider (CollideFunc collide, const char* name) + : collide(collide), name(name) {} + + operator bool() const { return name; } + + public: + CollideFunc collide; + const char* name; + }; + + struct Intersector1 + { + Intersector1 (ErrorFunc error = nullptr) + : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc intersect; + OccludedFunc occluded; + PointQueryFunc pointQuery; + const char* name; + }; + + struct Intersector4 + { + Intersector4 (ErrorFunc error = nullptr) + : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {} + + Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc4 intersect; + OccludedFunc4 occluded; + const char* name; + }; + + struct Intersector8 + { + Intersector8 (ErrorFunc error = nullptr) + : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {} + + Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc8 intersect; + OccludedFunc8 occluded; + const char* name; + }; + + struct Intersector16 + { + Intersector16 (ErrorFunc error = nullptr) + : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {} + + Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc16 intersect; + OccludedFunc16 occluded; + const char* name; + }; + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + struct Intersectors + { + Intersectors() + : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {} + + Intersectors (ErrorFunc error) + : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {} + + void print(size_t ident) + { + if (collider.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "collider = " << collider.name << std::endl; + } + if (intersector1.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector1 = " << intersector1.name << std::endl; + } + if (intersector4.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector4 = " << intersector4.name << std::endl; + } + if (intersector8.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector8 = " << intersector8.name << std::endl; + } + if (intersector16.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersector16 = " << intersector16.name << std::endl; + } + if (intersectorN.name) { + for (size_t i=0; i<ident; i++) std::cout << " "; + std::cout << "intersectorN = " << intersectorN.name << std::endl; + } + } + + void select(bool filter) + { + if (intersector4_filter) { + if (filter) intersector4 = intersector4_filter; + else intersector4 = intersector4_nofilter; + } + if (intersector8_filter) { + if (filter) intersector8 = intersector8_filter; + else intersector8 = intersector8_nofilter; + } + if (intersector16_filter) { + if (filter) intersector16 = intersector16_filter; + else intersector16 = intersector16_nofilter; + } + if (intersectorN_filter) { + if (filter) intersectorN = intersectorN_filter; + else intersectorN = intersectorN_nofilter; + } + } + + __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) { + assert(intersector1.pointQuery); + return intersector1.pointQuery(this,query,context); + } + + /*! collides two scenes */ + __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) { + assert(collider.collide); + collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr); + } + + /*! Intersects a single ray with the scene. */ + __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) { + assert(intersector1.intersect); + intersector1.intersect(this,ray,context); + } + + /*! Intersects a packet of 4 rays with the scene. */ + __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) { + assert(intersector4.intersect); + intersector4.intersect(valid,this,ray,context); + } + + /*! Intersects a packet of 8 rays with the scene. */ + __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) { + assert(intersector8.intersect); + intersector8.intersect(valid,this,ray,context); + } + + /*! Intersects a packet of 16 rays with the scene. */ + __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) { + assert(intersector16.intersect); + intersector16.intersect(valid,this,ray,context); + } + + /*! Intersects a stream of N rays in SOA layout with the scene. */ + __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context) + { + assert(intersectorN.intersect); + intersectorN.intersect(this,rayN,N,context); + } + +#if defined(__SSE__) + __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) { + const vint<4> mask = valid.mask32(); + intersect4(&mask,(RTCRayHit4&)ray,context); + } +#endif +#if defined(__AVX__) + __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) { + const vint<8> mask = valid.mask32(); + intersect8(&mask,(RTCRayHit8&)ray,context); + } +#endif +#if defined(__AVX512F__) + __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) { + const vint<16> mask = valid.mask32(); + intersect16(&mask,(RTCRayHit16&)ray,context); + } +#endif + + template<int K> + __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context) + { + intersectN((RTCRayHitN**)rayN,N,context); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (RTCRay& ray, IntersectContext* context) { + assert(intersector1.occluded); + intersector1.occluded(this,ray,context); + } + + /*! Tests if a packet of 4 rays is occluded by the scene. */ + __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) { + assert(intersector4.occluded); + intersector4.occluded(valid,this,ray,context); + } + + /*! Tests if a packet of 8 rays is occluded by the scene. */ + __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) { + assert(intersector8.occluded); + intersector8.occluded(valid,this,ray,context); + } + + /*! Tests if a packet of 16 rays is occluded by the scene. */ + __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) { + assert(intersector16.occluded); + intersector16.occluded(valid,this,ray,context); + } + + /*! Tests if a stream of N rays in SOA layout is occluded by the scene. */ + __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context) + { + assert(intersectorN.occluded); + intersectorN.occluded(this,rayN,N,context); + } + +#if defined(__SSE__) + __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) { + const vint<4> mask = valid.mask32(); + occluded4(&mask,(RTCRay4&)ray,context); + } +#endif +#if defined(__AVX__) + __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) { + const vint<8> mask = valid.mask32(); + occluded8(&mask,(RTCRay8&)ray,context); + } +#endif +#if defined(__AVX512F__) + __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) { + const vint<16> mask = valid.mask32(); + occluded16(&mask,(RTCRay16&)ray,context); + } +#endif + + template<int K> + __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context) + { + occludedN((RTCRayN**)rayN,N,context); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void intersect(RTCRay& ray, IntersectContext* context) { + occluded(ray, context); + } + + /*! Tests if a packet of K rays is occluded by the scene. */ + template<int K> + __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) { + occluded(valid, ray, context); + } + + /*! Tests if a packet of N rays in SOA layout is occluded by the scene. */ + template<int K> + __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) { + occludedN(rayN, N, context); + } + + public: + AccelData* ptr; + void* leafIntersector; + Collider collider; + Intersector1 intersector1; + Intersector4 intersector4; + Intersector4 intersector4_filter; + Intersector4 intersector4_nofilter; + Intersector8 intersector8; + Intersector8 intersector8_filter; + Intersector8 intersector8_nofilter; + Intersector16 intersector16; + Intersector16 intersector16_filter; + Intersector16 intersector16_nofilter; + IntersectorN intersectorN; + IntersectorN intersectorN_filter; + IntersectorN intersectorN_nofilter; + }; + + public: + + /*! Construction */ + Accel (const AccelData::Type type) + : AccelData(type) {} + + /*! Construction */ + Accel (const AccelData::Type type, const Intersectors& intersectors) + : AccelData(type), intersectors(intersectors) {} + + /*! Virtual destructor */ + virtual ~Accel() {} + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! build acceleration structure */ + virtual void build () = 0; + + public: + Intersectors intersectors; + }; + +#define DEFINE_COLLIDER(symbol,collider) \ + Accel::Collider symbol() { \ + return Accel::Collider((Accel::CollideFunc)collider::collide, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR1(symbol,intersector) \ + Accel::Intersector1 symbol() { \ + return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \ + (Accel::OccludedFunc )intersector::occluded, \ + (Accel::PointQueryFunc)intersector::pointQuery,\ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR4(symbol,intersector) \ + Accel::Intersector4 symbol() { \ + return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \ + (Accel::OccludedFunc4)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR8(symbol,intersector) \ + Accel::Intersector8 symbol() { \ + return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \ + (Accel::OccludedFunc8)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTOR16(symbol,intersector) \ + Accel::Intersector16 symbol() { \ + return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \ + (Accel::OccludedFunc16)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + +#define DEFINE_INTERSECTORN(symbol,intersector) \ + Accel::IntersectorN symbol() { \ + return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \ + (Accel::OccludedFuncN)intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } + + /* ray stream filter interface */ + typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit* _rayN, const size_t N, const size_t stride, IntersectContext* context); + typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context); + typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context); + typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context); + + typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay* _rayN, const size_t N, const size_t stride, IntersectContext* context); + typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context); + typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context); + typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context); + + struct RayStreamFilterFuncs + { + RayStreamFilterFuncs() + : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr), + occludedAOS(nullptr), occludedAOP(nullptr), occludedSOA(nullptr), occludedSOP(nullptr) {} + + RayStreamFilterFuncs(void (*ptr) ()) + : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr), + occludedAOS((occludedStreamAOS_func) ptr), occludedAOP((occludedStreamAOP_func) ptr), occludedSOA((occludedStreamSOA_func) ptr), occludedSOP((occludedStreamSOP_func) ptr) {} + + RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP, + occludedStreamAOS_func occludedAOS, occludedStreamAOP_func occludedAOP, occludedStreamSOA_func occludedSOA, occludedStreamSOP_func occludedSOP) + : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP), + occludedAOS(occludedAOS), occludedAOP(occludedAOP), occludedSOA(occludedSOA), occludedSOP(occludedSOP) {} + + public: + intersectStreamAOS_func intersectAOS; + intersectStreamAOP_func intersectAOP; + intersectStreamSOA_func intersectSOA; + intersectStreamSOP_func intersectSOP; + + occludedStreamAOS_func occludedAOS; + occludedStreamAOP_func occludedAOP; + occludedStreamSOA_func occludedSOA; + occludedStreamSOP_func occludedSOP; + }; + + typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)(); +} diff --git a/thirdparty/embree/kernels/common/accelinstance.h b/thirdparty/embree/kernels/common/accelinstance.h new file mode 100644 index 0000000000..c63ef998bd --- /dev/null +++ b/thirdparty/embree/kernels/common/accelinstance.h @@ -0,0 +1,41 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" +#include "builder.h" + +namespace embree +{ + class AccelInstance : public Accel + { + public: + AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors) + : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {} + + void immutable () { + builder.reset(nullptr); + } + + public: + void build () { + if (builder) builder->build(); + bounds = accel->bounds; + } + + void deleteGeometry(size_t geomID) { + if (accel ) accel->deleteGeometry(geomID); + if (builder) builder->deleteGeometry(geomID); + } + + void clear() { + if (accel) accel->clear(); + if (builder) builder->clear(); + } + + private: + std::unique_ptr<AccelData> accel; + std::unique_ptr<Builder> builder; + }; +} diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp new file mode 100644 index 0000000000..32a27c560a --- /dev/null +++ b/thirdparty/embree/kernels/common/acceln.cpp @@ -0,0 +1,232 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "acceln.h" +#include "ray.h" +#include "../../include/embree3/rtcore_ray.h" +#include "../../common/algorithms/parallel_for.h" + +namespace embree +{ + AccelN::AccelN() + : Accel(AccelData::TY_ACCELN), accels() {} + + AccelN::~AccelN() + { + for (size_t i=0; i<accels.size(); i++) + delete accels[i]; + } + + void AccelN::accels_add(Accel* accel) + { + assert(accel); + accels.push_back(accel); + } + + void AccelN::accels_init() + { + for (size_t i=0; i<accels.size(); i++) + delete accels[i]; + + accels.clear(); + } + + bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context) + { + bool changed = false; + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + changed |= This->accels[i]->intersectors.pointQuery(query,context); + return changed; + } + + void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect(ray,context); + } + + void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect4(valid,ray,context); + } + + void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect8(valid,ray,context); + } + + void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect16(valid,ray,context); + } + + void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersectN(ray,N,context); + } + + void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded(ray,context); + if (ray.tfar < 0.0f) break; + } + } + + void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded4(valid,ray,context); +#if defined(__SSE2__) + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + if (unlikely(none(valid0 & hit0))) break; +#endif + } + } + + void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded8(valid,ray,context); +#if defined(__SSE2__) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; +#endif + } + } + + void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; i<This->accels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded16(valid,ray,context); +#if defined(__SSE2__) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + vbool4 valid2 = asBool(((vint4*)valid)[2]); + vbool4 hit2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero); + vbool4 valid3 = asBool(((vint4*)valid)[3]); + vbool4 hit3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break; +#endif + } + } + + void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + size_t M = N; + for (size_t i=0; i<This->accels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.occludedN(ray,M,context); + } + + void AccelN::accels_print(size_t ident) + { + for (size_t i=0; i<accels.size(); i++) + { + for (size_t j=0; j<ident; j++) std::cout << " "; + std::cout << "accels[" << i << "]" << std::endl; + accels[i]->intersectors.print(ident+2); + } + } + + void AccelN::accels_immutable() + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->immutable(); + } + + void AccelN::accels_build () + { + /* reduce memory consumption */ + accels.shrink_to_fit(); + + /* build all acceleration structures in parallel */ + parallel_for (accels.size(), [&] (size_t i) { + accels[i]->build(); + }); + + /* create list of non-empty acceleration structures */ + bool valid1 = true; + bool valid4 = true; + bool valid8 = true; + bool valid16 = true; + for (size_t i=0; i<accels.size(); i++) { + valid1 &= (bool) accels[i]->intersectors.intersector1; + valid4 &= (bool) accels[i]->intersectors.intersector4; + valid8 &= (bool) accels[i]->intersectors.intersector8; + valid16 &= (bool) accels[i]->intersectors.intersector16; + } + + if (accels.size() == 1) { + type = accels[0]->type; // FIXME: should just assign entire Accel + bounds = accels[0]->bounds; + intersectors = accels[0]->intersectors; + } + else + { + type = AccelData::TY_ACCELN; + intersectors.ptr = this; + intersectors.intersector1 = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr); + intersectors.intersector4 = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr); + intersectors.intersector8 = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr); + intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr); + intersectors.intersectorN = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN"); + + /*! calculate bounds */ + bounds = empty; + for (size_t i=0; i<accels.size(); i++) + bounds.extend(accels[i]->bounds); + } + } + + void AccelN::accels_select(bool filter) + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->intersectors.select(filter); + } + + void AccelN::accels_deleteGeometry(size_t geomID) + { + for (size_t i=0; i<accels.size(); i++) + accels[i]->deleteGeometry(geomID); + } + + void AccelN::accels_clear() + { + for (size_t i=0; i<accels.size(); i++) { + accels[i]->clear(); + } + } +} + diff --git a/thirdparty/embree/kernels/common/acceln.h b/thirdparty/embree/kernels/common/acceln.h new file mode 100644 index 0000000000..0445b2e811 --- /dev/null +++ b/thirdparty/embree/kernels/common/acceln.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" + +namespace embree +{ + /*! merges N acceleration structures together, by processing them in order */ + class AccelN : public Accel + { + public: + AccelN (); + ~AccelN(); + + public: + void accels_add(Accel* accel); + void accels_init(); + + public: + static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + + public: + static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context); + static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context); + static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context); + static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context); + static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context); + + public: + static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context); + static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context); + static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context); + static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context); + static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context); + + public: + void accels_print(size_t ident); + void accels_immutable(); + void accels_build (); + void accels_select(bool filter); + void accels_deleteGeometry(size_t geomID); + void accels_clear (); + + public: + std::vector<Accel*> accels; + }; +} diff --git a/thirdparty/embree/kernels/common/accelset.cpp b/thirdparty/embree/kernels/common/accelset.cpp new file mode 100644 index 0000000000..8c18f31776 --- /dev/null +++ b/thirdparty/embree/kernels/common/accelset.cpp @@ -0,0 +1,17 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "accelset.h" +#include "scene.h" + +namespace embree +{ + AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) + : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (ErrorFunc error) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} +} diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h new file mode 100644 index 0000000000..90b184a07b --- /dev/null +++ b/thirdparty/embree/kernels/common/accelset.h @@ -0,0 +1,248 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "builder.h" +#include "geometry.h" +#include "ray.h" +#include "hit.h" + +namespace embree +{ + struct IntersectFunctionNArguments; + struct OccludedFunctionNArguments; + + typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + + struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportIntersectionFunc report; + }; + + struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportOcclusionFunc report; + }; + + /*! Base class for set of acceleration structures. */ + class AccelSet : public Geometry + { + public: + typedef RTCIntersectFunctionN IntersectFuncN; + typedef RTCOccludedFunctionN OccludedFuncN; + typedef void (*ErrorFunc) (); + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) ; + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name); + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + public: + + /*! construction */ + AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps); + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! build accel */ + virtual void build () = 0; + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid_non_empty(bounds(i,itime))) return false; + + return true; + } + + /*! Calculates the bounds of an item */ + __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const + { + BBox3fa box; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)itime; + args.bounds_o = (RTCBounds*)&box; + boundsFunc(&args); + return box; + } + + /*! calculates the linear bounds of the i'th item at the itime'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const + { + BBox3fa box[2]; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)(itime+0); + args.bounds_o = (RTCBounds*)&box[0]; + boundsFunc(&args); + args.timeStep = (unsigned int)(itime+1); + args.bounds_o = (RTCBounds*)&box[1]; + boundsFunc(&args); + return LBBox3fa(box[0],box[1]); + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid_non_empty(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds0; // use bounding box of first timestep to build BVH + return isvalid_non_empty(bounds); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + public: + + /*! Intersects a single ray with the scene. */ + __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + int mask = -1; + IntersectFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + int mask = -1; + OccludedFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + /*! Intersects a packet of K rays with the scene. */ + template<int K> + __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + vint<K> mask = valid.mask32(); + IntersectFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if a packet of K rays is occluded by the scene. */ + template<int K> + __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + vint<K> mask = valid.mask32(); + OccludedFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + public: + RTCBoundsFunction boundsFunc; + IntersectorN intersectorN; + }; + +#define DEFINE_SET_INTERSECTORN(symbol,intersector) \ + AccelSet::IntersectorN symbol() { \ + return AccelSet::IntersectorN(intersector::intersect, \ + intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } +} diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp new file mode 100644 index 0000000000..1a0e1aeed3 --- /dev/null +++ b/thirdparty/embree/kernels/common/alloc.cpp @@ -0,0 +1,79 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "../../common/sys/thread.h" + +namespace embree +{ + __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; + SpinLock FastAllocator::s_thread_local_allocators_lock; + std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators; + + struct fast_allocator_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic<size_t> numFailed; + std::unique_ptr<FastAllocator> alloc; + + fast_allocator_regression_test() + : RegressionTest("fast_allocator_regression_test"), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(fast_allocator_regression_test* This) + { + FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator(); + + size_t* ptrs[1000]; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + for (size_t i=0; i<1000; i++) { + ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32)); + *ptrs[i] = size_t(threadalloc.talloc0) + i; + } + for (size_t i=0; i<1000; i++) { + if (*ptrs[i] != size_t(threadalloc.talloc0) + i) + This->numFailed++; + } + This->barrier.wait(); + } + } + + bool run () + { + alloc = make_unique(new FastAllocator(nullptr,false)); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + barrier.init(numThreads+1); + + /* create threads */ + std::vector<thread_t> threads; + for (size_t i=0; i<numThreads; i++) + threads.push_back(createThread((thread_func)thread_alloc,this)); + + /* run test */ + for (size_t i=0; i<1000; i++) + { + alloc->reset(); + barrier.wait(); + barrier.wait(); + } + + /* destroy threads */ + for (size_t i=0; i<numThreads; i++) + join(threads[i]); + + alloc = nullptr; + + return numFailed == 0; + } + }; + + fast_allocator_regression_test fast_allocator_regression; +} + + diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h new file mode 100644 index 0000000000..4458e35c24 --- /dev/null +++ b/thirdparty/embree/kernels/common/alloc.h @@ -0,0 +1,958 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "primref.h" + +namespace embree +{ + class FastAllocator + { + /*! maximum supported alignment */ + static const size_t maxAlignment = 64; + + /*! maximum allocation size */ + + /* default settings */ + //static const size_t defaultBlockSize = 4096; +#define maxAllocationSize size_t(2*1024*1024-maxAlignment) + + static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; + + public: + + struct ThreadLocal2; + enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE }; + + /*! Per thread structure holding the current memory block. */ + struct __aligned(64) ThreadLocal + { + ALIGNED_CLASS_(64); + public: + + /*! Constructor for usage with ThreadLocalData */ + __forceinline ThreadLocal (ThreadLocal2* parent) + : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} + + /*! initialize allocator */ + void init(FastAllocator* alloc) + { + ptr = nullptr; + cur = end = 0; + bytesUsed = 0; + bytesWasted = 0; + allocBlockSize = 0; + if (alloc) allocBlockSize = alloc->defaultBlockSize; + } + + /* Allocate aligned memory from the threads memory block. */ + __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) + { + /* bind the thread local allocator to the proper FastAllocator*/ + parent->bind(alloc); + + assert(align <= maxAlignment); + bytesUsed += bytes; + + /* try to allocate in local block */ + size_t ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* if allocation is too large allocate with parent allocator */ + if (4*bytes > allocBlockSize) { + return alloc->malloc(bytes,maxAlignment,false); + } + + /* get new partial block if allocation failed */ + size_t blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* get new full block if allocation failed */ + blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* should never happen as large allocations get handled specially above */ + assert(false); + return nullptr; + } + + + /*! returns amount of used bytes */ + __forceinline size_t getUsedBytes() const { return bytesUsed; } + + /*! returns amount of free bytes */ + __forceinline size_t getFreeBytes() const { return end-cur; } + + /*! returns amount of wasted bytes */ + __forceinline size_t getWastedBytes() const { return bytesWasted; } + + private: + ThreadLocal2* parent; + char* ptr; //!< pointer to memory block + size_t cur; //!< current location of the allocator + size_t end; //!< end of the memory block + size_t allocBlockSize; //!< block size for allocations + size_t bytesUsed; //!< number of total bytes allocated + size_t bytesWasted; //!< number of bytes wasted + }; + + /*! Two thread local structures. */ + struct __aligned(64) ThreadLocal2 + { + ALIGNED_CLASS_(64); + public: + + __forceinline ThreadLocal2() + : alloc(nullptr), alloc0(this), alloc1(this) {} + + /*! bind to fast allocator */ + __forceinline void bind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() == alloc_i) return; + Lock<SpinLock> lock(mutex); + //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind + if (alloc.load()) { + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + } + alloc0.init(alloc_i); + alloc1.init(alloc_i); + alloc.store(alloc_i); + alloc_i->join(this); + } + + /*! unbind to fast allocator */ + void unbind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() != alloc_i) return; + Lock<SpinLock> lock(mutex); + if (alloc.load() != alloc_i) return; // required as a different thread calls unbind + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + alloc0.init(nullptr); + alloc1.init(nullptr); + alloc.store(nullptr); + } + + public: + SpinLock mutex; //!< required as unbind is called from other threads + std::atomic<FastAllocator*> alloc; //!< parent allocator + ThreadLocal alloc0; + ThreadLocal alloc1; + }; + + FastAllocator (Device* device, bool osAllocation) + : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC), + primrefarray(device,0) + { + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + assert(!slotMutex[i].isLocked()); + } + } + + ~FastAllocator () { + clear(); + } + + /*! returns the device attached to this allocator */ + Device* getDevice() { + return device; + } + + void share(mvector<PrimRef>& primrefarray_i) { + primrefarray = std::move(primrefarray_i); + } + + void unshare(mvector<PrimRef>& primrefarray_o) + { + reset(); // this removes blocks that are allocated inside the shared primref array + primrefarray_o = std::move(primrefarray); + } + + /*! returns first fast thread local allocator */ + __forceinline ThreadLocal* _threadLocal() { + return &threadLocal2()->alloc0; + } + + void setOSallocation(bool flag) + { + atype = flag ? OS_MALLOC : ALIGNED_MALLOC; + } + + private: + + /*! returns both fast thread local allocators */ + __forceinline ThreadLocal2* threadLocal2() + { + ThreadLocal2* alloc = thread_local_allocator2; + if (alloc == nullptr) { + thread_local_allocator2 = alloc = new ThreadLocal2; + Lock<SpinLock> lock(s_thread_local_allocators_lock); + s_thread_local_allocators.push_back(make_unique(alloc)); + } + return alloc; + } + + public: + + __forceinline void join(ThreadLocal2* alloc) + { + Lock<SpinLock> lock(thread_local_allocators_lock); + thread_local_allocators.push_back(alloc); + } + + public: + + struct CachedAllocator + { + __forceinline CachedAllocator(void* ptr) + : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) + { + assert(ptr == nullptr); + } + + __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) + : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {} + + __forceinline operator bool () const { + return alloc != nullptr; + } + + __forceinline void* operator() (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { + return talloc1->malloc(alloc,bytes,align); + } + + public: + FastAllocator* alloc; + ThreadLocal* talloc0; + ThreadLocal* talloc1; + }; + + __forceinline CachedAllocator getCachedAllocator() { + return CachedAllocator(this,threadLocal2()); + } + + /*! Builder interface to create thread local allocator */ + struct Create + { + public: + __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} + __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } + + private: + FastAllocator* allocator; + }; + + void internal_fix_used_blocks() + { + /* move thread local blocks to global block list */ + for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + while (threadBlocks[i].load() != nullptr) { + Block* nextUsedBlock = threadBlocks[i].load()->next; + threadBlocks[i].load()->next = usedBlocks.load(); + usedBlocks = threadBlocks[i].load(); + threadBlocks[i] = nextUsedBlock; + } + threadBlocks[i] = nullptr; + } + } + + static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks + static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks + static const size_t mainAllocOverheadDynamic = 8; //! 20 means 12.5% allocation overhead through unfilled main alloc blocks + + /* calculates a single threaded threshold for the builders such + * that for small scenes the overhead of partly allocated blocks + * per thread is low */ + size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) + { + if (numPrimitives == 0 || bytesEstimated == 0) + return defaultThreshold; + + /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ + const size_t single_mode_factor = use_single_mode ? 1 : 2; + const size_t threadCount = TaskScheduler::threadCount(); + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; + + /* if we do not have to limit number of threads use optimal thresdhold */ + if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + return defaultThreshold; + + /* otherwise limit number of threads by calculating proper single thread threshold */ + else { + double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); + return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); + } + } + + __forceinline size_t alignSize(size_t i) { + return (i+127)/128*128; + } + + /*! initializes the grow size */ + __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) + { + /* we do not need single thread local allocator mode */ + use_single_mode = false; + + /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ + size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic; + size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); + growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); + + /* if we reached the maxAllocationSize for growSize, we can + * increase the number of allocation slots by still guaranteeing + * the mainAllocationOverhead */ + slotMask = 0x0; + + if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ + + /* set the thread local alloc block size */ + size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; + + /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ +#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL + const size_t threadCount = TaskScheduler::threadCount(); + const size_t single_mode_factor = use_single_mode ? 1 : 2; + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; + if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); + + /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ + else +#endif + defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); + + if (bytesEstimated == 0) { + maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size + defaultBlockSize = defaultBlockSizeSwitch; + } + log2_grow_size_scale = 0; + + if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; + if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; + if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; + if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; + if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; + if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; + if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; + } + + /*! initializes the allocator */ + void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) + { + internal_fix_used_blocks(); + /* distribute the allocation to multiple thread block slots */ + slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + if (bytesReserve == 0) bytesReserve = bytesAllocate; + freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); + estimatedSize = bytesEstimate; + initGrowSizeAndNumSlots(bytesEstimate,true); + } + + /*! initializes the allocator */ + void init_estimate(size_t bytesEstimate) + { + internal_fix_used_blocks(); + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + /* single allocator mode ? */ + estimatedSize = bytesEstimate; + //initGrowSizeAndNumSlots(bytesEstimate,false); + initGrowSizeAndNumSlots(bytesEstimate,false); + + } + + /*! frees state not required after build */ + __forceinline void cleanup() + { + internal_fix_used_blocks(); + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! resets the allocator, memory blocks get reused */ + void reset () + { + internal_fix_used_blocks(); + + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + + /* reset all used blocks and move them to begin of free block list */ + while (usedBlocks.load() != nullptr) { + usedBlocks.load()->reset_block(); + Block* nextUsedBlock = usedBlocks.load()->next; + usedBlocks.load()->next = freeBlocks.load(); + freeBlocks = usedBlocks.load(); + usedBlocks = nextUsedBlock; + } + + /* remove all shared blocks as they are re-added during build */ + freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); + + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + } + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! frees all allocated memory */ + __forceinline void clear() + { + cleanup(); + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; + if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; + for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) { + threadUsedBlocks[i] = nullptr; + threadBlocks[i] = nullptr; + } + primrefarray.clear(); + } + + __forceinline size_t incGrowSizeScale() + { + size_t scale = log2_grow_size_scale.fetch_add(1)+1; + return size_t(1) << min(size_t(16),scale); + } + + /*! thread safe allocation of memory */ + void* malloc(size_t& bytes, size_t align, bool partial) + { + assert(align <= maxAlignment); + + while (true) + { + /* allocate using current block */ + size_t threadID = TaskScheduler::threadID(); + size_t slot = threadID & slotMask; + Block* myUsedBlocks = threadUsedBlocks[slot]; + if (myUsedBlocks) { + void* ptr = myUsedBlocks->malloc(device,bytes,align,partial); + if (ptr) return ptr; + } + + /* throw error if allocation is too large */ + if (bytes > maxAllocationSize) + throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large"); + + /* parallel block creation in case of no freeBlocks, avoids single global mutex */ + if (likely(freeBlocks.load() == nullptr)) + { + Lock<SpinLock> lock(slotMutex[slot]); + if (myUsedBlocks == threadUsedBlocks[slot]) { + const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); + const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); + assert(allocSize >= bytes); + threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! + // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. + } + continue; + } + + /* if this fails allocate new block */ + { + Lock<SpinLock> lock(mutex); + if (myUsedBlocks == threadUsedBlocks[slot]) + { + if (freeBlocks.load() != nullptr) { + Block* nextFreeBlock = freeBlocks.load()->next; + freeBlocks.load()->next = usedBlocks; + __memory_barrier(); + usedBlocks = freeBlocks.load(); + threadUsedBlocks[slot] = freeBlocks.load(); + freeBlocks = nextFreeBlock; + } else { + const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); + usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! + } + } + } + } + } + + /*! add new block */ + void addBlock(void* ptr, ssize_t bytes) + { + Lock<SpinLock> lock(mutex); + const size_t sizeof_Header = offsetof(Block,data[0]); + void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); + size_t ofs = (size_t) aptr - (size_t) ptr; + bytes -= ofs; + if (bytes < 4096) return; // ignore empty or very small blocks + freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); + } + + /* special allocation only used from morton builder only a single time for each build */ + void* specialAlloc(size_t bytes) + { + assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); + return freeBlocks.load()->ptr(); + } + + struct Statistics + { + Statistics () + : bytesUsed(0), bytesFree(0), bytesWasted(0) {} + + Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) + : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} + + Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) + : bytesUsed(0), bytesFree(0), bytesWasted(0) + { + Block* usedBlocks = alloc->usedBlocks.load(); + Block* freeBlocks = alloc->freeBlocks.load(); + if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); + if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); + if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); + if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); + if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); + } + + std::string str(size_t numPrimitives) + { + std::stringstream str; + str.setf(std::ios::fixed, std::ios::floatfield); + str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); + return str.str(); + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b) + { + return Statistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted); + } + + size_t bytesAllocatedTotal() const { + return bytesUsed + bytesFree + bytesWasted; + } + + public: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + }; + + Statistics getStatistics(AllocationType atype, bool huge_pages = false) { + return Statistics(this,atype,huge_pages); + } + + size_t getUsedBytes() { + return bytesUsed; + } + + size_t getWastedBytes() { + return bytesWasted; + } + + struct AllStatistics + { + AllStatistics (FastAllocator* alloc) + + : bytesUsed(alloc->bytesUsed), + bytesFree(alloc->bytesFree), + bytesWasted(alloc->bytesWasted), + stat_all(alloc,ANY_TYPE), + stat_malloc(alloc,ALIGNED_MALLOC), + stat_4K(alloc,OS_MALLOC,false), + stat_2M(alloc,OS_MALLOC,true), + stat_shared(alloc,SHARED) {} + + AllStatistics (size_t bytesUsed, + size_t bytesFree, + size_t bytesWasted, + Statistics stat_all, + Statistics stat_malloc, + Statistics stat_4K, + Statistics stat_2M, + Statistics stat_shared) + + : bytesUsed(bytesUsed), + bytesFree(bytesFree), + bytesWasted(bytesWasted), + stat_all(stat_all), + stat_malloc(stat_malloc), + stat_4K(stat_4K), + stat_2M(stat_2M), + stat_shared(stat_shared) {} + + friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) + { + return AllStatistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted, + a.stat_all + b.stat_all, + a.stat_malloc + b.stat_malloc, + a.stat_4K + b.stat_4K, + a.stat_2M + b.stat_2M, + a.stat_shared + b.stat_shared); + } + + void print(size_t numPrimitives) + { + std::stringstream str0; + str0.setf(std::ios::fixed, std::ios::floatfield); + str0 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << " " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); + std::cout << str0.str() << std::endl; + + std::stringstream str1; + str1.setf(std::ios::fixed, std::ios::floatfield); + str1 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); + std::cout << str1.str() << std::endl; + + std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; + std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; + std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; + std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; + std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; + } + + private: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + Statistics stat_all; + Statistics stat_malloc; + Statistics stat_4K; + Statistics stat_2M; + Statistics stat_shared; + }; + + void print_blocks() + { + std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; + + std::cout << " used blocks = "; + if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + + std::cout << " free blocks = "; + if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + } + + private: + + struct Block + { + static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) + { + /* We avoid using os_malloc for small blocks as this could + * cause a risk of fragmenting the virtual address space and + * reach the limit of vm.max_map_count = 65k under Linux. */ + if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize) + atype = ALIGNED_MALLOC; + + /* we need to additionally allocate some header */ + const size_t sizeof_Header = offsetof(Block,data[0]); + bytesAllocate = sizeof_Header+bytesAllocate; + bytesReserve = sizeof_Header+bytesReserve; + + /* consume full 4k pages with using os_malloc */ + if (atype == OS_MALLOC) { + bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + } + + /* either use alignedMalloc or os_malloc */ + void *ptr = nullptr; + if (atype == ALIGNED_MALLOC) + { + /* special handling for default block size */ + if (bytesAllocate == (2*PAGE_SIZE_2M)) + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + + /* give hint to transparently convert these pages to 2MB pages */ + const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); + os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block + os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); + os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block + + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + else + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + } + else if (atype == OS_MALLOC) + { + if (device) device->memoryMonitor(bytesAllocate,false); + bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); + return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + } + else + assert(false); + + return NULL; + } + + Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) + : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) + { + assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); + } + + static Block* remove_shared_blocks(Block* head) + { + Block** prev_next = &head; + for (Block* block = head; block; block = block->next) { + if (block->atype == SHARED) *prev_next = block->next; + else prev_next = &block->next; + } + return head; + } + + void clear_list(MemoryMonitorInterface* device) + { + Block* block = this; + while (block) { + Block* next = block->next; + block->clear_block(device); + block = next; + } + } + + void clear_block (MemoryMonitorInterface* device) + { + const size_t sizeof_Header = offsetof(Block,data[0]); + const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); + + if (atype == ALIGNED_MALLOC) { + alignedFree(this); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else if (atype == OS_MALLOC) { + size_t sizeof_This = sizeof_Header+reserveEnd; + os_free(this,sizeof_This,huge_pages); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else /* if (atype == SHARED) */ { + } + } + + void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) + { + size_t bytes = bytes_in; + assert(align <= maxAlignment); + bytes = (bytes+(align-1)) & ~(align-1); + if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; + const size_t i = cur.fetch_add(bytes); + if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; + if (unlikely(i > reserveEnd)) return nullptr; + bytes_in = bytes = min(bytes,reserveEnd-i); + + if (i+bytes > allocEnd) { + if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); + } + return &data[i]; + } + + void* ptr() { + return &data[cur]; + } + + void reset_block () + { + allocEnd = max(allocEnd,(size_t)cur); + cur = 0; + } + + size_t getBlockUsedBytes() const { + return min(size_t(cur),reserveEnd); + } + + size_t getBlockFreeBytes() const { + return getBlockAllocatedBytes() - getBlockUsedBytes(); + } + + size_t getBlockAllocatedBytes() const { + return min(max(allocEnd,size_t(cur)),reserveEnd); + } + + size_t getBlockWastedBytes() const { + const size_t sizeof_Header = offsetof(Block,data[0]); + return sizeof_Header + wasted; + } + + size_t getBlockReservedBytes() const { + return reserveEnd; + } + + bool hasType(AllocationType atype_i, bool huge_pages_i) const + { + if (atype_i == ANY_TYPE ) return true; + else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else return atype_i == atype; + } + + size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockUsedBytes(); + } + return bytes; + } + + size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockFreeBytes(); + } + return bytes; + } + + size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockWastedBytes(); + } + return bytes; + } + + size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockAllocatedBytes(); + } + return bytes; + } + + void print_list () + { + for (const Block* block = this; block; block = block->next) + block->print_block(); + } + + void print_block() const + { + if (atype == ALIGNED_MALLOC) std::cout << "A"; + else if (atype == OS_MALLOC) std::cout << "O"; + else if (atype == SHARED) std::cout << "S"; + if (huge_pages) std::cout << "H"; + size_t bytesUsed = getBlockUsedBytes(); + size_t bytesFree = getBlockFreeBytes(); + size_t bytesWasted = getBlockWastedBytes(); + std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] "; + } + + public: + std::atomic<size_t> cur; //!< current location of the allocator + std::atomic<size_t> allocEnd; //!< end of the allocated memory region + std::atomic<size_t> reserveEnd; //!< end of the reserved memory region + Block* next; //!< pointer to next block in list + size_t wasted; //!< amount of memory wasted through block alignment + AllocationType atype; //!< allocation mode of the block + bool huge_pages; //!< whether the block uses huge pages + char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment + char data[1]; //!< here starts memory to use for allocations + }; + + private: + Device* device; + SpinLock mutex; + size_t slotMask; + std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; + std::atomic<Block*> usedBlocks; + std::atomic<Block*> freeBlocks; + + std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; + SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; + + bool use_single_mode; + size_t defaultBlockSize; + size_t estimatedSize; + size_t growSize; + size_t maxGrowSize; + std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove + std::atomic<size_t> bytesUsed; + std::atomic<size_t> bytesFree; + std::atomic<size_t> bytesWasted; + static __thread ThreadLocal2* thread_local_allocator2; + static SpinLock s_thread_local_allocators_lock; + static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; + SpinLock thread_local_allocators_lock; + std::vector<ThreadLocal2*> thread_local_allocators; + AllocationType atype; + mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes + }; +} diff --git a/thirdparty/embree/kernels/common/buffer.h b/thirdparty/embree/kernels/common/buffer.h new file mode 100644 index 0000000000..793012c04d --- /dev/null +++ b/thirdparty/embree/kernels/common/buffer.h @@ -0,0 +1,263 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" + +namespace embree +{ + /*! Implements an API data buffer object. This class may or may not own the data. */ + class Buffer : public RefCount + { + public: + /*! Buffer construction */ + Buffer() + : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {} + + /*! Buffer construction */ + Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr) + : device(device), numBytes(numBytes_in) + { + device->refInc(); + + if (ptr_in) + { + shared = true; + ptr = (char*)ptr_in; + } + else + { + shared = false; + alloc(); + } + } + + /*! Buffer destruction */ + ~Buffer() { + free(); + device->refDec(); + } + + /*! this class is not copyable */ + private: + Buffer(const Buffer& other) DELETED; // do not implement + Buffer& operator =(const Buffer& other) DELETED; // do not implement + + public: + /* inits and allocates the buffer */ + void create(Device* device_in, size_t numBytes_in) + { + init(device_in, numBytes_in); + alloc(); + } + + /* inits the buffer */ + void init(Device* device_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = nullptr; + numBytes = numBytes_in; + shared = false; + } + + /*! sets shared buffer */ + void set(Device* device_in, void* ptr_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = (char*)ptr_in; + if (numBytes_in != (size_t)-1) + numBytes = numBytes_in; + shared = true; + } + + /*! allocated buffer */ + void alloc() + { + if (device) + device->memoryMonitor(this->bytes(), false); + size_t b = (this->bytes()+15) & ssize_t(-16); + ptr = (char*)alignedMalloc(b,16); + } + + /*! frees the buffer */ + void free() + { + if (shared) return; + alignedFree(ptr); + if (device) + device->memoryMonitor(-ssize_t(this->bytes()), true); + ptr = nullptr; + } + + /*! gets buffer pointer */ + void* data() + { + /* report error if buffer is not existing */ + if (!device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified"); + + /* return buffer */ + return ptr; + } + + /*! returns pointer to first element */ + __forceinline char* getPtr() const { + return ptr; + } + + /*! returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return numBytes; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr; + } + + public: + Device* device; //!< device to report memory usage to + char* ptr; //!< pointer to buffer data + size_t numBytes; //!< number of bytes in the buffer + bool shared; //!< set if memory is shared with application + }; + + /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */ + class RawBufferView + { + public: + /*! Buffer construction */ + RawBufferView() + : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {} + + public: + /*! sets the buffer view */ + void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in) + { + if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds"); + + ptr_ofs = buffer_in->ptr + offset_in; + stride = stride_in; + num = num_in; + format = format_in; + modCounter++; + modified = true; + buffer = buffer_in; + } + + /*! returns pointer to the first element */ + __forceinline char* getPtr() const { + return ptr_ofs; + } + + /*! returns pointer to the i'th element */ + __forceinline char* getPtr(size_t i) const + { + assert(i<num); + return ptr_ofs + i*stride; + } + + /*! returns the number of elements of the buffer */ + __forceinline size_t size() const { + return num; + } + + /*! returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return num*stride; + } + + /*! returns the buffer stride */ + __forceinline unsigned getStride() const + { + assert(stride <= unsigned(inf)); + return unsigned(stride); + } + + /*! return the buffer format */ + __forceinline RTCFormat getFormat() const { + return format; + } + + /*! mark buffer as modified or unmodified */ + __forceinline void setModified() { + modCounter++; + modified = true; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isModified(unsigned int otherModCounter) const { + return modCounter > otherModCounter; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isLocalModified() const { + return modified; + } + + /*! clear local modified flag */ + __forceinline void clearLocalModified() { + modified = false; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr_ofs; + } + + /*! checks padding to 16 byte check, fails hard */ + __forceinline void checkPadding16() const + { + if (ptr_ofs && num) + volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable? + } + + public: + char* ptr_ofs; //!< base pointer plus offset + size_t stride; //!< stride of the buffer in bytes + size_t num; //!< number of elements in the buffer + RTCFormat format; //!< format of the buffer + unsigned int modCounter; //!< version ID of this buffer + bool modified; //!< local modified data + int userData; //!< special data + Ref<Buffer> buffer; //!< reference to the parent buffer + }; + + /*! A typed contiguous range of a buffer. This class does not own the buffer content. */ + template<typename T> + class BufferView : public RawBufferView + { + public: + typedef T value_type; + + /*! access to the ith element of the buffer */ + __forceinline T& operator [](size_t i) { assert(i<num); return *(T*)(ptr_ofs + i*stride); } + __forceinline const T& operator [](size_t i) const { assert(i<num); return *(T*)(ptr_ofs + i*stride); } + }; + + template<> + class BufferView<Vec3fa> : public RawBufferView + { + public: + typedef Vec3fa value_type; + + /*! access to the ith element of the buffer */ + __forceinline const Vec3fa operator [](size_t i) const + { + assert(i<num); + return Vec3fa(vfloat4::loadu((float*)(ptr_ofs + i*stride))); + } + + /*! writes the i'th element */ + __forceinline void store(size_t i, const Vec3fa& v) + { + assert(i<num); + vfloat4::storeu((float*)(ptr_ofs + i*stride), (vfloat4)v); + } + }; +} diff --git a/thirdparty/embree/kernels/common/builder.h b/thirdparty/embree/kernels/common/builder.h new file mode 100644 index 0000000000..07fe7b069b --- /dev/null +++ b/thirdparty/embree/kernels/common/builder.h @@ -0,0 +1,60 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "accel.h" + +namespace embree +{ +#define MODE_HIGH_QUALITY (1<<8) + + /*! virtual interface for all hierarchy builders */ + class Builder : public RefCount { + public: + + static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + /*! initiates the hierarchy builder */ + virtual void build() = 0; + + /*! notifies the builder about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears internal builder state */ + virtual void clear() = 0; + }; + + /*! virtual interface for progress monitor class */ + struct BuildProgressMonitor { + virtual void operator() (size_t dn) const = 0; + }; + + /*! build the progress monitor interface from a closure */ + template<typename Closure> + struct ProgressMonitorClosure : BuildProgressMonitor + { + public: + ProgressMonitorClosure (const Closure& closure) : closure(closure) {} + void operator() (size_t dn) const { closure(dn); } + private: + const Closure closure; + }; + template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) { + return ProgressMonitorClosure<Closure>(closure); + } + + struct LineSegments; + struct TriangleMesh; + struct QuadMesh; + struct UserGeometry; + + class Scene; + + typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder); + typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + +} diff --git a/thirdparty/embree/kernels/common/context.h b/thirdparty/embree/kernels/common/context.h new file mode 100644 index 0000000000..ccd88bdeac --- /dev/null +++ b/thirdparty/embree/kernels/common/context.h @@ -0,0 +1,131 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "rtcore.h" +#include "point_query.h" + +namespace embree +{ + class Scene; + + struct IntersectContext + { + public: + __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context) + : scene(scene), user(user_context) {} + + __forceinline bool hasContextFilter() const { + return user->filter != nullptr; + } + + __forceinline bool isCoherent() const { + return embree::isCoherent(user->flags); + } + + __forceinline bool isIncoherent() const { + return embree::isIncoherent(user->flags); + } + + public: + Scene* scene; + RTCIntersectContext* user; + }; + + template<int M, typename Geometry> + __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v) + { +#if RTC_MIN_WIDTH + const vfloat<M> d = length(Vec3vf<M>(v) - ray_org); + const vfloat<M> r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec4vf<M>(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + template<typename Geometry> + __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v) + { +#if RTC_MIN_WIDTH + const float d = length(Vec3fa(v) - ray_org); + const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec3ff(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + enum PointQueryType + { + POINT_QUERY_TYPE_UNDEFINED = 0, + POINT_QUERY_TYPE_SPHERE = 1, + POINT_QUERY_TYPE_AABB = 2, + }; + + typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + + struct PointQueryContext + { + public: + __forceinline PointQueryContext(Scene* scene, + PointQuery* query_ws, + PointQueryType query_type, + PointQueryFunction func, + RTCPointQueryContext* userContext, + float similarityScale, + void* userPtr) + : scene(scene) + , query_ws(query_ws) + , query_type(query_type) + , func(func) + , userContext(userContext) + , similarityScale(similarityScale) + , userPtr(userPtr) + , primID(RTC_INVALID_GEOMETRY_ID) + , geomID(RTC_INVALID_GEOMETRY_ID) + , query_radius(query_ws->radius) + { + if (query_type == POINT_QUERY_TYPE_AABB) { + assert(similarityScale == 0.f); + updateAABB(); + } + if (userContext->instStackSize == 0) { + assert(similarityScale == 1.f); + } + } + + public: + __forceinline void updateAABB() + { + if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) { + query_radius = Vec3fa(query_ws->radius); + return; + } + + const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius)); + bbox = xfmBounds(m, bbox); + query_radius = 0.5f * (bbox.upper - bbox.lower); + } + +public: + Scene* scene; + + PointQuery* query_ws; // the original world space point query + PointQueryType query_type; + PointQueryFunction func; + RTCPointQueryContext* userContext; + const float similarityScale; + + void* userPtr; + + unsigned int primID; + unsigned int geomID; + + Vec3fa query_radius; // used if the query is converted to an AABB internally + }; +} + diff --git a/thirdparty/embree/kernels/common/default.h b/thirdparty/embree/kernels/common/default.h new file mode 100644 index 0000000000..f15d61b768 --- /dev/null +++ b/thirdparty/embree/kernels/common/default.h @@ -0,0 +1,268 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" +#include "../../common/sys/thread.h" +#include "../../common/sys/alloc.h" +#include "../../common/sys/ref.h" +#include "../../common/sys/intrinsics.h" +#include "../../common/sys/atomic.h" +#include "../../common/sys/mutex.h" +#include "../../common/sys/vector.h" +#include "../../common/sys/array.h" +#include "../../common/sys/string.h" +#include "../../common/sys/regression.h" +#include "../../common/sys/vector.h" + +#include "../../common/math/math.h" +#include "../../common/math/transcendental.h" +#include "../../common/simd/simd.h" +#include "../../common/math/vec2.h" +#include "../../common/math/vec3.h" +#include "../../common/math/vec4.h" +#include "../../common/math/vec2fa.h" +#include "../../common/math/vec3fa.h" +#include "../../common/math/interval.h" +#include "../../common/math/bbox.h" +#include "../../common/math/obbox.h" +#include "../../common/math/lbbox.h" +#include "../../common/math/linearspace2.h" +#include "../../common/math/linearspace3.h" +#include "../../common/math/affinespace.h" +#include "../../common/math/range.h" +#include "../../common/lexers/tokenstream.h" + +#include "../../common/tasking/taskscheduler.h" + +#define COMMA , + +#include "../config.h" +#include "isa.h" +#include "stat.h" +#include "profile.h" +#include "rtcore.h" +#include "vector.h" +#include "state.h" +#include "instance_stack.h" + +#include <vector> +#include <map> +#include <algorithm> +#include <functional> +#include <utility> +#include <sstream> + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Vec2 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec2vf = Vec2<vfloat<N>>; + template<int N> using Vec2vd = Vec2<vdouble<N>>; + template<int N> using Vec2vr = Vec2<vreal<N>>; + template<int N> using Vec2vi = Vec2<vint<N>>; + template<int N> using Vec2vl = Vec2<vllong<N>>; + template<int N> using Vec2vb = Vec2<vbool<N>>; + template<int N> using Vec2vbf = Vec2<vboolf<N>>; + template<int N> using Vec2vbd = Vec2<vboold<N>>; + + typedef Vec2<vfloat4> Vec2vf4; + typedef Vec2<vdouble4> Vec2vd4; + typedef Vec2<vreal4> Vec2vr4; + typedef Vec2<vint4> Vec2vi4; + typedef Vec2<vllong4> Vec2vl4; + typedef Vec2<vbool4> Vec2vb4; + typedef Vec2<vboolf4> Vec2vbf4; + typedef Vec2<vboold4> Vec2vbd4; + + typedef Vec2<vfloat8> Vec2vf8; + typedef Vec2<vdouble8> Vec2vd8; + typedef Vec2<vreal8> Vec2vr8; + typedef Vec2<vint8> Vec2vi8; + typedef Vec2<vllong8> Vec2vl8; + typedef Vec2<vbool8> Vec2vb8; + typedef Vec2<vboolf8> Vec2vbf8; + typedef Vec2<vboold8> Vec2vbd8; + + typedef Vec2<vfloat16> Vec2vf16; + typedef Vec2<vdouble16> Vec2vd16; + typedef Vec2<vreal16> Vec2vr16; + typedef Vec2<vint16> Vec2vi16; + typedef Vec2<vllong16> Vec2vl16; + typedef Vec2<vbool16> Vec2vb16; + typedef Vec2<vboolf16> Vec2vbf16; + typedef Vec2<vboold16> Vec2vbd16; + + typedef Vec2<vfloatx> Vec2vfx; + typedef Vec2<vdoublex> Vec2vdx; + typedef Vec2<vrealx> Vec2vrx; + typedef Vec2<vintx> Vec2vix; + typedef Vec2<vllongx> Vec2vlx; + typedef Vec2<vboolx> Vec2vbx; + typedef Vec2<vboolfx> Vec2vbfx; + typedef Vec2<vbooldx> Vec2vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec3 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec3vf = Vec3<vfloat<N>>; + template<int N> using Vec3vd = Vec3<vdouble<N>>; + template<int N> using Vec3vr = Vec3<vreal<N>>; + template<int N> using Vec3vi = Vec3<vint<N>>; + template<int N> using Vec3vl = Vec3<vllong<N>>; + template<int N> using Vec3vb = Vec3<vbool<N>>; + template<int N> using Vec3vbf = Vec3<vboolf<N>>; + template<int N> using Vec3vbd = Vec3<vboold<N>>; + + typedef Vec3<vfloat4> Vec3vf4; + typedef Vec3<vdouble4> Vec3vd4; + typedef Vec3<vreal4> Vec3vr4; + typedef Vec3<vint4> Vec3vi4; + typedef Vec3<vllong4> Vec3vl4; + typedef Vec3<vbool4> Vec3vb4; + typedef Vec3<vboolf4> Vec3vbf4; + typedef Vec3<vboold4> Vec3vbd4; + + typedef Vec3<vfloat8> Vec3vf8; + typedef Vec3<vdouble8> Vec3vd8; + typedef Vec3<vreal8> Vec3vr8; + typedef Vec3<vint8> Vec3vi8; + typedef Vec3<vllong8> Vec3vl8; + typedef Vec3<vbool8> Vec3vb8; + typedef Vec3<vboolf8> Vec3vbf8; + typedef Vec3<vboold8> Vec3vbd8; + + typedef Vec3<vfloat16> Vec3vf16; + typedef Vec3<vdouble16> Vec3vd16; + typedef Vec3<vreal16> Vec3vr16; + typedef Vec3<vint16> Vec3vi16; + typedef Vec3<vllong16> Vec3vl16; + typedef Vec3<vbool16> Vec3vb16; + typedef Vec3<vboolf16> Vec3vbf16; + typedef Vec3<vboold16> Vec3vbd16; + + typedef Vec3<vfloatx> Vec3vfx; + typedef Vec3<vdoublex> Vec3vdx; + typedef Vec3<vrealx> Vec3vrx; + typedef Vec3<vintx> Vec3vix; + typedef Vec3<vllongx> Vec3vlx; + typedef Vec3<vboolx> Vec3vbx; + typedef Vec3<vboolfx> Vec3vbfx; + typedef Vec3<vbooldx> Vec3vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec4 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using Vec4vf = Vec4<vfloat<N>>; + template<int N> using Vec4vd = Vec4<vdouble<N>>; + template<int N> using Vec4vr = Vec4<vreal<N>>; + template<int N> using Vec4vi = Vec4<vint<N>>; + template<int N> using Vec4vl = Vec4<vllong<N>>; + template<int N> using Vec4vb = Vec4<vbool<N>>; + template<int N> using Vec4vbf = Vec4<vboolf<N>>; + template<int N> using Vec4vbd = Vec4<vboold<N>>; + + typedef Vec4<vfloat4> Vec4vf4; + typedef Vec4<vdouble4> Vec4vd4; + typedef Vec4<vreal4> Vec4vr4; + typedef Vec4<vint4> Vec4vi4; + typedef Vec4<vllong4> Vec4vl4; + typedef Vec4<vbool4> Vec4vb4; + typedef Vec4<vboolf4> Vec4vbf4; + typedef Vec4<vboold4> Vec4vbd4; + + typedef Vec4<vfloat8> Vec4vf8; + typedef Vec4<vdouble8> Vec4vd8; + typedef Vec4<vreal8> Vec4vr8; + typedef Vec4<vint8> Vec4vi8; + typedef Vec4<vllong8> Vec4vl8; + typedef Vec4<vbool8> Vec4vb8; + typedef Vec4<vboolf8> Vec4vbf8; + typedef Vec4<vboold8> Vec4vbd8; + + typedef Vec4<vfloat16> Vec4vf16; + typedef Vec4<vdouble16> Vec4vd16; + typedef Vec4<vreal16> Vec4vr16; + typedef Vec4<vint16> Vec4vi16; + typedef Vec4<vllong16> Vec4vl16; + typedef Vec4<vbool16> Vec4vb16; + typedef Vec4<vboolf16> Vec4vbf16; + typedef Vec4<vboold16> Vec4vbd16; + + typedef Vec4<vfloatx> Vec4vfx; + typedef Vec4<vdoublex> Vec4vdx; + typedef Vec4<vrealx> Vec4vrx; + typedef Vec4<vintx> Vec4vix; + typedef Vec4<vllongx> Vec4vlx; + typedef Vec4<vboolx> Vec4vbx; + typedef Vec4<vboolfx> Vec4vbfx; + typedef Vec4<vbooldx> Vec4vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Other shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template<int N> using BBox3vf = BBox<Vec3vf<N>>; + typedef BBox<Vec3vf4> BBox3vf4; + typedef BBox<Vec3vf8> BBox3vf8; + typedef BBox<Vec3vf16> BBox3vf16; + + /* calculate time segment itime and fractional time ftime */ + __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime) + { + const float timeScaled = time * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime) + { + const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + template<int N> + __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime) + { + const vfloat<N> timeScaled = time * numTimeSegments; + const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint<N>(itimef); + } + + template<int N> + __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime) + { + const vfloat<N> timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const vfloat<N> itimef = clamp(floor(timeScaled), vfloat<N>(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint<N>(itimef); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments) + { + const float round_up = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step + const float round_down = 1.0f-2.0f*float(ulp); + const int itime_lower = (int)max(floor(round_up *time_range.lower*numTimeSegments), 0.0f); + const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments); + return make_range(itime_lower, itime_upper); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments) + { + const float lower = (range.lower-time_range.lower)/time_range.size(); + const float upper = (range.upper-time_range.lower)/time_range.size(); + return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments); + } +} diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp new file mode 100644 index 0000000000..068e0c2983 --- /dev/null +++ b/thirdparty/embree/kernels/common/device.cpp @@ -0,0 +1,556 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "device.h" +#include "../hash.h" +#include "scene_triangle_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_subdiv_mesh.h" + +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +#include "../geometry/cylinder.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" + +#include "../../common/tasking/taskscheduler.h" +#include "../../common/sys/alloc.h" + +namespace embree +{ + /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */ + ssize_t Device::debug_int0 = 0; + ssize_t Device::debug_int1 = 0; + ssize_t Device::debug_int2 = 0; + ssize_t Device::debug_int3 = 0; + + DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs); + + static MutexSys g_mutex; + static std::map<Device*,size_t> g_cache_size_map; + static std::map<Device*,size_t> g_num_threads_map; + + Device::Device (const char* cfg) + { + /* check that CPU supports lowest ISA */ + if (!hasISA(ISA)) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR); + } + + /* set default frequency level for detected CPU */ + switch (getCPUModel()) { + case CPU::UNKNOWN: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_KABY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::SANDY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::NEHALEM: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE2: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_PHI_KNIGHTS_MILL : frequency_level = FREQUENCY_SIMD512; break; + case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break; + case CPU::ARM: frequency_level = FREQUENCY_SIMD128; break; + } + + /* initialize global state */ +#if defined(EMBREE_CONFIG) + State::parseString(EMBREE_CONFIG); +#endif + State::parseString(cfg); + State::verify(); + + /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */ + if (!checkISASupport()) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA"); + } + + /*! do some internal tests */ + assert(isa::Cylinder::verify()); + + /*! enable huge page support if desired */ +#if defined(__WIN32__) + if (State::enable_selockmemoryprivilege) + State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3)); +#endif + State::hugepages_success &= os_init(State::hugepages,State::verbosity(3)); + + /*! set tessellation cache size */ + setCacheSize( State::tessellation_cache_size ); + + /*! enable some floating point exceptions to catch bugs */ + if (State::float_exceptions) + { + int exceptions = _MM_MASK_MASK; + //exceptions &= ~_MM_MASK_INVALID; + exceptions &= ~_MM_MASK_DENORM; + exceptions &= ~_MM_MASK_DIV_ZERO; + //exceptions &= ~_MM_MASK_OVERFLOW; + //exceptions &= ~_MM_MASK_UNDERFLOW; + //exceptions &= ~_MM_MASK_INEXACT; + _MM_SET_EXCEPTION_MASK(exceptions); + } + + /* print info header */ + if (State::verbosity(1)) + print(); + if (State::verbosity(2)) + State::print(); + + /* register all algorithms */ + bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features)); + +#if defined(EMBREE_TARGET_SIMD8) + bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features)); +#endif + + /* setup tasking system */ + initTaskingSystem(numThreads); + + /* ray stream SOA to AOS conversion */ +#if defined(EMBREE_RAY_PACKETS) + RayStreamFilterFuncsType rayStreamFilterFuncs; + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(enabled_cpu_features,rayStreamFilterFuncs); + rayStreamFilters = rayStreamFilterFuncs(); +#endif + } + + Device::~Device () + { + setCacheSize(0); + exitTaskingSystem(); + } + + std::string getEnabledTargets() + { + std::string v; +#if defined(EMBREE_TARGET_SSE2) + v += "SSE2 "; +#endif +#if defined(EMBREE_TARGET_SSE42) + v += "SSE4.2 "; +#endif +#if defined(EMBREE_TARGET_AVX) + v += "AVX "; +#endif +#if defined(EMBREE_TARGET_AVX2) + v += "AVX2 "; +#endif +#if defined(EMBREE_TARGET_AVX512) + v += "AVX512 "; +#endif + return v; + } + + std::string getEmbreeFeatures() + { + std::string v; +#if defined(EMBREE_RAY_MASK) + v += "raymasks "; +#endif +#if defined (EMBREE_BACKFACE_CULLING) + v += "backfaceculling "; +#endif +#if defined (EMBREE_BACKFACE_CULLING_CURVES) + v += "backfacecullingcurves "; +#endif +#if defined(EMBREE_FILTER_FUNCTION) + v += "intersection_filter "; +#endif +#if defined (EMBREE_COMPACT_POLYS) + v += "compact_polys "; +#endif + return v; + } + + void Device::print() + { + const int cpu_features = getCPUFeatures(); + std::cout << std::endl; + std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl; + std::cout << " Compiler : " << getCompilerName() << std::endl; + std::cout << " Build : "; +#if defined(DEBUG) + std::cout << "Debug " << std::endl; +#else + std::cout << "Release " << std::endl; +#endif + std::cout << " Platform : " << getPlatformName() << std::endl; + std::cout << " CPU : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl; + std::cout << " Threads : " << getNumberOfLogicalThreads() << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(cpu_features) << std::endl; + const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; + const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; + std::cout << " MXCSR : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl; + std::cout << " Config" << std::endl; + std::cout << " Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl; + std::cout << " " << getEnabledTargets() << " (compile time enabled)" << std::endl; + std::cout << " Features: " << getEmbreeFeatures() << std::endl; + std::cout << " Tasking : "; +#if defined(TASKING_TBB) + std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " "; + #if TBB_INTERFACE_VERSION >= 12002 + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " "; + #else + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " "; + #endif +#endif +#if defined(TASKING_INTERNAL) + std::cout << "internal_tasking_system "; +#endif +#if defined(TASKING_PPL) + std::cout << "PPL "; +#endif + std::cout << std::endl; + + /* check of FTZ and DAZ flags are set in CSR */ + if (!hasFTZ || !hasDAZ) + { +#if !defined(_DEBUG) + if (State::verbosity(1)) +#endif + { + std::cout << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << " WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled " << std::endl + << " in the MXCSR control and status register. This can have a severe " << std::endl + << " performance impact. Please enable these modes for each application " << std::endl + << " thread the following way:" << std::endl + << std::endl + << " #include \"xmmintrin.h\"" << std::endl + << " #include \"pmmintrin.h\"" << std::endl + << std::endl + << " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl + << " _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << std::endl; + } + } + std::cout << std::endl; + } + + void Device::setDeviceErrorCode(RTCError error) + { + RTCError* stored_error = errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getDeviceErrorCode() + { + RTCError* stored_error = errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::setThreadErrorCode(RTCError error) + { + RTCError* stored_error = g_errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getThreadErrorCode() + { + RTCError* stored_error = g_errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::process_error(Device* device, RTCError error, const char* str) + { + /* store global error code when device construction failed */ + if (!device) + return setThreadErrorCode(error); + + /* print error when in verbose mode */ + if (device->verbosity(1)) + { + switch (error) { + case RTC_ERROR_NONE : std::cerr << "Embree: No error"; break; + case RTC_ERROR_UNKNOWN : std::cerr << "Embree: Unknown error"; break; + case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break; + case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break; + case RTC_ERROR_OUT_OF_MEMORY : std::cerr << "Embree: Out of memory"; break; + case RTC_ERROR_UNSUPPORTED_CPU : std::cerr << "Embree: Unsupported CPU"; break; + default : std::cerr << "Embree: Invalid error code"; break; + }; + if (str) std::cerr << ", (" << str << ")"; + std::cerr << std::endl; + } + + /* call user specified error callback */ + if (device->error_function) + device->error_function(device->error_function_userptr,error,str); + + /* record error code */ + device->setDeviceErrorCode(error); + } + + void Device::memoryMonitor(ssize_t bytes, bool post) + { + if (State::memory_monitor_function && bytes != 0) { + if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) { + if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor + throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination"); + } + } + } + } + + size_t getMaxNumThreads() + { + size_t maxNumThreads = 0; + for (std::map<Device*,size_t>::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++) + maxNumThreads = max(maxNumThreads, (*i).second); + if (maxNumThreads == 0) + maxNumThreads = std::numeric_limits<size_t>::max(); + return maxNumThreads; + } + + size_t getMaxCacheSize() + { + size_t maxCacheSize = 0; + for (std::map<Device*,size_t>::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++) + maxCacheSize = max(maxCacheSize, (*i).second); + return maxCacheSize; + } + + void Device::setCacheSize(size_t bytes) + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + Lock<MutexSys> lock(g_mutex); + if (bytes == 0) g_cache_size_map.erase(this); + else g_cache_size_map[this] = bytes; + + size_t maxCacheSize = getMaxCacheSize(); + resizeTessellationCache(maxCacheSize); +#endif + } + + void Device::initTaskingSystem(size_t numThreads) + { + Lock<MutexSys> lock(g_mutex); + if (numThreads == 0) + g_num_threads_map[this] = std::numeric_limits<size_t>::max(); + else + g_num_threads_map[this] = numThreads; + + /* create task scheduler */ + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); +#if USE_TASK_ARENA + const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount()); + const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads); + arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads)); +#endif + } + + void Device::exitTaskingSystem() + { + Lock<MutexSys> lock(g_mutex); + g_num_threads_map.erase(this); + + /* terminate tasking system */ + if (g_num_threads_map.size() == 0) { + TaskScheduler::destroy(); + } + /* or configure new number of threads */ + else { + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); + } +#if USE_TASK_ARENA + arena.reset(); +#endif + } + + void Device::setProperty(const RTCDeviceProperty prop, ssize_t val) + { + /* hidden internal properties */ + switch ((size_t)prop) + { + case 1000000: debug_int0 = val; return; + case 1000001: debug_int1 = val; return; + case 1000002: debug_int2 = val; return; + case 1000003: debug_int3 = val; return; + } + + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property"); + } + + ssize_t Device::getProperty(const RTCDeviceProperty prop) + { + size_t iprop = (size_t)prop; + + /* get name of internal regression test */ + if (iprop >= 2000000 && iprop < 3000000) + { + RegressionTest* test = getRegressionTest(iprop-2000000); + if (test) return (ssize_t) test->name.c_str(); + else return 0; + } + + /* run internal regression test */ + if (iprop >= 3000000 && iprop < 4000000) + { + RegressionTest* test = getRegressionTest(iprop-3000000); + if (test) return test->run(); + else return 0; + } + + /* documented properties */ + switch (prop) + { + case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR; + case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR; + case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH; + case RTC_DEVICE_PROPERTY_VERSION : return RTC_VERSION; + +#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return hasISA(SSE2); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return hasISA(AVX); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_MASK) + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING_CURVES) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0; +#endif + +#if defined(EMBREE_COMPACT_POLYS) + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0; +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0; +#endif + +#if defined(TASKING_INTERNAL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; +#endif + +#if defined(TASKING_TBB) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_USER) + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#else + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1; +#endif + +#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0; +#endif + + default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break; + }; + } +} diff --git a/thirdparty/embree/kernels/common/device.h b/thirdparty/embree/kernels/common/device.h new file mode 100644 index 0000000000..21c42c654d --- /dev/null +++ b/thirdparty/embree/kernels/common/device.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "state.h" +#include "accel.h" + +namespace embree +{ + class BVH4Factory; + class BVH8Factory; + + class Device : public State, public MemoryMonitorInterface + { + ALIGNED_CLASS_(16); + + public: + + /*! Device construction */ + Device (const char* cfg); + + /*! Device destruction */ + virtual ~Device (); + + /*! prints info about the device */ + void print(); + + /*! sets the error code */ + void setDeviceErrorCode(RTCError error); + + /*! returns and clears the error code */ + RTCError getDeviceErrorCode(); + + /*! sets the error code */ + static void setThreadErrorCode(RTCError error); + + /*! returns and clears the error code */ + static RTCError getThreadErrorCode(); + + /*! processes error codes, do not call directly */ + static void process_error(Device* device, RTCError error, const char* str); + + /*! invokes the memory monitor callback */ + void memoryMonitor(ssize_t bytes, bool post); + + /*! sets the size of the software cache. */ + void setCacheSize(size_t bytes); + + /*! sets a property */ + void setProperty(const RTCDeviceProperty prop, ssize_t val); + + /*! gets a property */ + ssize_t getProperty(const RTCDeviceProperty prop); + + private: + + /*! initializes the tasking system */ + void initTaskingSystem(size_t numThreads); + + /*! shuts down the tasking system */ + void exitTaskingSystem(); + + /*! some variables that can be set via rtcSetParameter1i for debugging purposes */ + public: + static ssize_t debug_int0; + static ssize_t debug_int1; + static ssize_t debug_int2; + static ssize_t debug_int3; + + public: + std::unique_ptr<BVH4Factory> bvh4_factory; +#if defined(EMBREE_TARGET_SIMD8) + std::unique_ptr<BVH8Factory> bvh8_factory; +#endif + +#if USE_TASK_ARENA + std::unique_ptr<tbb::task_arena> arena; +#endif + + /* ray streams filter */ + RayStreamFilterFuncs rayStreamFilters; + }; +} diff --git a/thirdparty/embree/kernels/common/geometry.cpp b/thirdparty/embree/kernels/common/geometry.cpp new file mode 100644 index 0000000000..d8d3f65a5c --- /dev/null +++ b/thirdparty/embree/kernels/common/geometry.cpp @@ -0,0 +1,259 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "geometry.h" +#include "scene.h" + +namespace embree +{ + const char* Geometry::gtype_names[Geometry::GTY_END] = + { + "flat_linear_curve", + "round_linear_curve", + "oriented_linear_curve", + "", + "flat_bezier_curve", + "round_bezier_curve", + "oriented_bezier_curve", + "", + "flat_bspline_curve", + "round_bspline_curve", + "oriented_bspline_curve", + "", + "flat_hermite_curve", + "round_hermite_curve", + "oriented_hermite_curve", + "", + "flat_catmull_rom_curve", + "round_catmull_rom_curve", + "oriented_catmull_rom_curve", + "", + "triangles", + "quads", + "grid", + "subdivs", + "", + "sphere", + "disc", + "oriented_disc", + "", + "usergeom", + "instance_cheap", + "instance_expensive", + }; + + Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) + : device(device), userPtr(nullptr), + numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), + mask(-1), + gtype(gtype), + gsubtype(GTY_SUBTYPE_DEFAULT), + quality(RTC_BUILD_QUALITY_MEDIUM), + state((unsigned)State::MODIFIED), + enabled(true), + intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr) + { + device->refInc(); + } + + Geometry::~Geometry() + { + device->refDec(); + } + + void Geometry::setNumPrimitives(unsigned int numPrimitives_in) + { + if (numPrimitives_in == numPrimitives) return; + + numPrimitives = numPrimitives_in; + + Geometry::update(); + } + + void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in) + { + if (numTimeSteps_in == numTimeSteps) { + return; + } + + numTimeSteps = numTimeSteps_in; + fnumTimeSegments = float(numTimeSteps_in-1); + + Geometry::update(); + } + + void Geometry::setTimeRange (const BBox1f range) + { + time_range = range; + Geometry::update(); + } + + void Geometry::update() + { + ++modCounter_; // FIXME: required? + state = (unsigned)State::MODIFIED; + } + + void Geometry::commit() + { + ++modCounter_; + state = (unsigned)State::COMMITTED; + } + + void Geometry::preCommit() + { + if (State::MODIFIED == (State)state) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed"); + } + + void Geometry::postCommit() + { + } + + void Geometry::enable () + { + if (isEnabled()) + return; + + enabled = true; + ++modCounter_; + } + + void Geometry::disable () + { + if (isDisabled()) + return; + + enabled = false; + ++modCounter_; + } + + void Geometry::setUserData (void* ptr) + { + userPtr = ptr; + } + + void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + intersectionFilterN = filter; + } + + void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + occlusionFilterN = filter; + } + + void Geometry::setPointQueryFunction (RTCPointQueryFunction func) + { + pointQueryFunc = func; + } + + void Geometry::interpolateN(const RTCInterpolateNArguments* const args) + { + const void* valid_i = args->valid; + const unsigned* primIDs = args->primIDs; + const float* u = args->u; + const float* v = args->v; + unsigned int N = args->N; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); + const int* valid = (const int*) valid_i; + + __aligned(64) float P_tmp[256]; + __aligned(64) float dPdu_tmp[256]; + __aligned(64) float dPdv_tmp[256]; + __aligned(64) float ddPdudu_tmp[256]; + __aligned(64) float ddPdvdv_tmp[256]; + __aligned(64) float ddPdudv_tmp[256]; + + float* Pt = P ? P_tmp : nullptr; + float* dPdut = nullptr, *dPdvt = nullptr; + if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; } + float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr; + if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; } + + for (unsigned int i=0; i<N; i++) + { + if (valid && !valid[i]) continue; + + RTCInterpolateArguments iargs; + iargs.primID = primIDs[i]; + iargs.u = u[i]; + iargs.v = v[i]; + iargs.bufferType = bufferType; + iargs.bufferSlot = bufferSlot; + iargs.P = Pt; + iargs.dPdu = dPdut; + iargs.dPdv = dPdvt; + iargs.ddPdudu = ddPdudut; + iargs.ddPdvdv = ddPdvdvt; + iargs.ddPdudv = ddPdudvt; + iargs.valueCount = valueCount; + interpolate(&iargs); + + if (likely(P)) { + for (unsigned int j=0; j<valueCount; j++) + P[j*N+i] = Pt[j]; + } + if (likely(dPdu)) + { + for (unsigned int j=0; j<valueCount; j++) { + dPdu[j*N+i] = dPdut[j]; + dPdv[j*N+i] = dPdvt[j]; + } + } + if (likely(ddPdudu)) + { + for (unsigned int j=0; j<valueCount; j++) { + ddPdudu[j*N+i] = ddPdudut[j]; + ddPdvdv[j*N+i] = ddPdvdvt[j]; + ddPdudv[j*N+i] = ddPdudvt[j]; + } + } + } + } + + bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context) + { + assert(context->primID < size()); + + RTCPointQueryFunctionArguments args; + args.query = (RTCPointQuery*)context->query_ws; + args.userPtr = context->userPtr; + args.primID = context->primID; + args.geomID = context->geomID; + args.context = context->userContext; + args.similarityScale = context->similarityScale; + + bool update = false; + if(context->func) update |= context->func(&args); + if(pointQueryFunc) update |= pointQueryFunc(&args); + + if (update && context->userContext->instStackSize > 0) + { + // update point query + if (context->query_type == POINT_QUERY_TYPE_AABB) { + context->updateAABB(); + } else { + assert(context->similarityScale > 0.f); + query->radius = context->query_ws->radius * context->similarityScale; + } + } + return update; + } +} diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h new file mode 100644 index 0000000000..2f9f2e7c94 --- /dev/null +++ b/thirdparty/embree/kernels/common/geometry.h @@ -0,0 +1,582 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "buffer.h" +#include "../common/point_query.h" +#include "../builders/priminfo.h" + +namespace embree +{ + class Scene; + class Geometry; + + struct GeometryCounts + { + __forceinline GeometryCounts() + : numFilterFunctions(0), + numTriangles(0), numMBTriangles(0), + numQuads(0), numMBQuads(0), + numBezierCurves(0), numMBBezierCurves(0), + numLineSegments(0), numMBLineSegments(0), + numSubdivPatches(0), numMBSubdivPatches(0), + numUserGeometries(0), numMBUserGeometries(0), + numInstancesCheap(0), numMBInstancesCheap(0), + numInstancesExpensive(0), numMBInstancesExpensive(0), + numGrids(0), numMBGrids(0), + numPoints(0), numMBPoints(0) {} + + __forceinline size_t size() const { + return numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints + + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints; + } + + __forceinline unsigned int enabledGeometryTypesMask() const + { + unsigned int mask = 0; + if (numTriangles) mask |= 1 << 0; + if (numQuads) mask |= 1 << 1; + if (numBezierCurves+numLineSegments) mask |= 1 << 2; + if (numSubdivPatches) mask |= 1 << 3; + if (numUserGeometries) mask |= 1 << 4; + if (numInstancesCheap) mask |= 1 << 5; + if (numInstancesExpensive) mask |= 1 << 6; + if (numGrids) mask |= 1 << 7; + if (numPoints) mask |= 1 << 8; + + unsigned int maskMB = 0; + if (numMBTriangles) maskMB |= 1 << 0; + if (numMBQuads) maskMB |= 1 << 1; + if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2; + if (numMBSubdivPatches) maskMB |= 1 << 3; + if (numMBUserGeometries) maskMB |= 1 << 4; + if (numMBInstancesCheap) maskMB |= 1 << 5; + if (numMBInstancesExpensive) maskMB |= 1 << 6; + if (numMBGrids) maskMB |= 1 << 7; + if (numMBPoints) maskMB |= 1 << 8; + + return (mask<<8) + maskMB; + } + + __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const + { + GeometryCounts ret; + ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions; + ret.numTriangles = numTriangles + rhs.numTriangles; + ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles; + ret.numQuads = numQuads + rhs.numQuads; + ret.numMBQuads = numMBQuads + rhs.numMBQuads; + ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves; + ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves; + ret.numLineSegments = numLineSegments + rhs.numLineSegments; + ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments; + ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches; + ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches; + ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries; + ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries; + ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap; + ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap; + ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive; + ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive; + ret.numGrids = numGrids + rhs.numGrids; + ret.numMBGrids = numMBGrids + rhs.numMBGrids; + ret.numPoints = numPoints + rhs.numPoints; + ret.numMBPoints = numMBPoints + rhs.numMBPoints; + + return ret; + } + + size_t numFilterFunctions; //!< number of geometries with filter functions enabled + size_t numTriangles; //!< number of enabled triangles + size_t numMBTriangles; //!< number of enabled motion blured triangles + size_t numQuads; //!< number of enabled quads + size_t numMBQuads; //!< number of enabled motion blurred quads + size_t numBezierCurves; //!< number of enabled curves + size_t numMBBezierCurves; //!< number of enabled motion blurred curves + size_t numLineSegments; //!< number of enabled line segments + size_t numMBLineSegments; //!< number of enabled line motion blurred segments + size_t numSubdivPatches; //!< number of enabled subdivision patches + size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches + size_t numUserGeometries; //!< number of enabled user geometries + size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries + size_t numInstancesCheap; //!< number of enabled cheap instances + size_t numMBInstancesCheap; //!< number of enabled motion blurred cheap instances + size_t numInstancesExpensive; //!< number of enabled expensive instances + size_t numMBInstancesExpensive; //!< number of enabled motion blurred expensive instances + size_t numGrids; //!< number of enabled grid geometries + size_t numMBGrids; //!< number of enabled motion blurred grid geometries + size_t numPoints; //!< number of enabled points + size_t numMBPoints; //!< number of enabled motion blurred points + }; + + /*! Base class all geometries are derived from */ + class Geometry : public RefCount + { + friend class Scene; + public: + + /*! type of geometry */ + enum GType + { + GTY_FLAT_LINEAR_CURVE = 0, + GTY_ROUND_LINEAR_CURVE = 1, + GTY_ORIENTED_LINEAR_CURVE = 2, + GTY_CONE_LINEAR_CURVE = 3, + + GTY_FLAT_BEZIER_CURVE = 4, + GTY_ROUND_BEZIER_CURVE = 5, + GTY_ORIENTED_BEZIER_CURVE = 6, + + GTY_FLAT_BSPLINE_CURVE = 8, + GTY_ROUND_BSPLINE_CURVE = 9, + GTY_ORIENTED_BSPLINE_CURVE = 10, + + GTY_FLAT_HERMITE_CURVE = 12, + GTY_ROUND_HERMITE_CURVE = 13, + GTY_ORIENTED_HERMITE_CURVE = 14, + + GTY_FLAT_CATMULL_ROM_CURVE = 16, + GTY_ROUND_CATMULL_ROM_CURVE = 17, + GTY_ORIENTED_CATMULL_ROM_CURVE = 18, + + GTY_TRIANGLE_MESH = 20, + GTY_QUAD_MESH = 21, + GTY_GRID_MESH = 22, + GTY_SUBDIV_MESH = 23, + + GTY_SPHERE_POINT = 25, + GTY_DISC_POINT = 26, + GTY_ORIENTED_DISC_POINT = 27, + + GTY_USER_GEOMETRY = 29, + GTY_INSTANCE_CHEAP = 30, + GTY_INSTANCE_EXPENSIVE = 31, + GTY_END = 32, + + GTY_BASIS_LINEAR = 0, + GTY_BASIS_BEZIER = 4, + GTY_BASIS_BSPLINE = 8, + GTY_BASIS_HERMITE = 12, + GTY_BASIS_CATMULL_ROM = 16, + GTY_BASIS_MASK = 28, + + GTY_SUBTYPE_FLAT_CURVE = 0, + GTY_SUBTYPE_ROUND_CURVE = 1, + GTY_SUBTYPE_ORIENTED_CURVE = 2, + GTY_SUBTYPE_MASK = 3, + }; + + enum GSubType + { + GTY_SUBTYPE_DEFAULT= 0, + GTY_SUBTYPE_INSTANCE_LINEAR = 0, + GTY_SUBTYPE_INSTANCE_QUATERNION = 1 + }; + + enum GTypeMask + { + MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE, + MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE, + MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE, + MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE, + + MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE, + MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE, + MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE, + + MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE, + MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE, + MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE, + + MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE, + MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE, + MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE, + + MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE, + MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE, + MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE, + + MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE | + MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE | + MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE | + MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT, + MTY_DISC_POINT = 1ul << GTY_DISC_POINT, + MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT, + + MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT, + + MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS, + + MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH, + MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH, + MTY_GRID_MESH = 1ul << GTY_GRID_MESH, + MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH, + MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY, + + MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP, + MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE, + MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE + }; + + static const char* gtype_names[GTY_END]; + + enum class State : unsigned { + MODIFIED = 0, + COMMITTED = 1, + }; + + public: + + /*! Geometry constructor */ + Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps); + + /*! Geometry destructor */ + virtual ~Geometry(); + + public: + + /*! tests if geometry is enabled */ + __forceinline bool isEnabled() const { return enabled; } + + /*! tests if geometry is disabled */ + __forceinline bool isDisabled() const { return !isEnabled(); } + + /*! tests if that geometry has some filter function set */ + __forceinline bool hasFilterFunctions () const { + return (intersectionFilterN != nullptr) || (occlusionFilterN != nullptr); + } + + /*! returns geometry type */ + __forceinline GType getType() const { return gtype; } + + /*! returns curve type */ + __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); } + + /*! returns curve basis */ + __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); } + + /*! returns geometry type mask */ + __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); } + + /*! returns number of primitives */ + __forceinline size_t size() const { return numPrimitives; } + + /*! sets the number of primitives */ + virtual void setNumPrimitives(unsigned int numPrimitives_in); + + /*! sets number of time steps */ + virtual void setNumTimeSteps (unsigned int numTimeSteps_in); + + /*! sets motion blur time range */ + void setTimeRange (const BBox1f range); + + /*! sets number of vertex attributes */ + virtual void setVertexAttributeCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets number of topologies */ + virtual void setTopologyCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets the build quality */ + void setBuildQuality(RTCBuildQuality quality_in) + { + this->quality = quality_in; + Geometry::update(); + } + + /* calculate time segment itime and fractional time ftime */ + __forceinline int timeSegment(float time, float& ftime) const { + return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime); + } + + template<int N> + __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const { + return getTimeSegment<N>(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime); + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,fnumTimeSegments); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<(int)numTimeSteps); + return time_range.lower + time_range.size()*float(i)/fnumTimeSegments; + } + + /*! for all geometries */ + public: + + /*! Enable geometry. */ + virtual void enable(); + + /*! Update geometry. */ + void update(); + + /*! commit of geometry */ + virtual void commit(); + + /*! Update geometry buffer. */ + virtual void updateBuffer(RTCBufferType type, unsigned int slot) { + update(); // update everything for geometries not supporting this call + } + + /*! Disable geometry. */ + virtual void disable(); + + /*! Verify the geometry */ + virtual bool verify() { return true; } + + /*! called before every build */ + virtual void preCommit(); + + /*! called after every build */ + virtual void postCommit(); + + virtual void addElementsToCount (GeometryCounts & counts) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + }; + + /*! sets constant tessellation rate for the geometry */ + virtual void setTessellationRate(float N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets the maximal curve radius scale allowed by min-width feature. */ + virtual void setMaxRadiusScale(float s) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set user data pointer. */ + virtual void setUserData(void* ptr); + + /*! Get user data pointer. */ + __forceinline void* getUserData() const { + return userPtr; + } + + /*! interpolates user data to the specified u/v location */ + virtual void interpolate(const RTCInterpolateArguments* const args) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! interpolates user data to the specified u/v locations */ + virtual void interpolateN(const RTCInterpolateNArguments* const args); + + /* point query api */ + bool pointQuery(PointQuery* query, PointQueryContext* context); + + /*! for subdivision surfaces only */ + public: + virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set displacement function. */ + virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFirstHalfEdge(unsigned int faceID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFace(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getNextHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! get fast access to first vertex buffer if applicable */ + virtual float * getCompactVertexArray () const { + return nullptr; + } + + /*! Returns the modified counter - how many times the geo has been modified */ + __forceinline unsigned int getModCounter () const { + return modCounter_; + } + + /*! for triangle meshes and bezier curves only */ + public: + + + /*! Sets ray mask. */ + virtual void setMask(unsigned mask) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets specified buffer. */ + virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Gets specified buffer. */ + virtual void* getBuffer(RTCBufferType type, unsigned int slot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersection filter function for ray packets of size N. */ + virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! Set occlusion filter function for ray packets of size N. */ + virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! for instances only */ + public: + + /*! Sets the instanced scene */ + virtual void setInstancedScene(const Ref<Scene>& scene) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Returns the transformation of the instance */ + virtual AffineSpace3fa getTransform(float time) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! for user geometries only */ + public: + + /*! Set bounds function. */ + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersect function for ray packets of size N. */ + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set occlusion function for ray packets of size N. */ + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set point query function. */ + void setPointQueryFunction(RTCPointQueryFunction func); + + /*! returns number of time segments */ + __forceinline unsigned numTimeSegments () const { + return numTimeSteps-1; + } + + public: + + virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); + } + + virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID, size_t time) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual BBox3fa vbounds(size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + public: + __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; } + __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; } + + public: + Device* device; //!< device this geometry belongs to + + void* userPtr; //!< user pointer + unsigned int numPrimitives; //!< number of primitives of this geometry + + unsigned int numTimeSteps; //!< number of time steps + float fnumTimeSegments; //!< number of time segments (precalculation) + BBox1f time_range; //!< motion blur time range + + unsigned int mask; //!< for masking out geometry + unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified + + struct { + GType gtype : 8; //!< geometry type + GSubType gsubtype : 8; //!< geometry subtype + RTCBuildQuality quality : 3; //!< build quality for geometry + unsigned state : 2; + bool enabled : 1; //!< true if geometry is enabled + }; + + RTCFilterFunctionN intersectionFilterN; + RTCFilterFunctionN occlusionFilterN; + RTCPointQueryFunction pointQueryFunc; + }; +} diff --git a/thirdparty/embree/kernels/common/hit.h b/thirdparty/embree/kernels/common/hit.h new file mode 100644 index 0000000000..fd1a9d6391 --- /dev/null +++ b/thirdparty/embree/kernels/common/hit.h @@ -0,0 +1,114 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "instance_stack.h" + +namespace embree +{ + /* Hit structure for K hits */ + template<int K> + struct HitK + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng) + : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + instance_id_stack::copy_UV<K>(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return K; } + + public: + Vec3vf<K> Ng; // geometry normal + vfloat<K> u; // barycentric u coordinate of hit + vfloat<K> v; // barycentric v coordinate of hit + vuint<K> primID; // primitive ID + vuint<K> geomID; // geometry ID + vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single hit */ + template<> + struct __aligned(16) HitK<1> + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng) + : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID) + { + instance_id_stack::copy_UU(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return 1; } + + public: + Vec3<float> Ng; // geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Shortcuts */ + typedef HitK<1> Hit; + typedef HitK<4> Hit4; + typedef HitK<8> Hit8; + typedef HitK<16> Hit16; + + /* Outputs hit to stream */ + template<int K> + __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& ray) + { + cout << "{ " << embree_endl + << " Ng = " << ray.Ng << embree_endl + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + template<typename Hit> + __forceinline void copyHitToRay(RayHit& ray, const Hit& hit) + { + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = hit.primID; + ray.geomID = hit.geomID; + instance_id_stack::copy_UU(hit.instID, ray.instID); + } + + template<int K> + __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit) + { + vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x); + vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y); + vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z); + vfloat<K>::storeu(mask,&ray.u, hit.u); + vfloat<K>::storeu(mask,&ray.v, hit.v); + vuint<K>::storeu(mask,&ray.primID, hit.primID); + vuint<K>::storeu(mask,&ray.geomID, hit.geomID); + instance_id_stack::copy_VV<K>(hit.instID, ray.instID, mask); + } +} diff --git a/thirdparty/embree/kernels/common/instance_stack.h b/thirdparty/embree/kernels/common/instance_stack.h new file mode 100644 index 0000000000..d3c0a643f1 --- /dev/null +++ b/thirdparty/embree/kernels/common/instance_stack.h @@ -0,0 +1,179 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore.h" + +namespace embree { +namespace instance_id_stack { + +static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, + "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0."); + +/******************************************************************************* + * Instance ID stack manipulation. + * This is used from the instance intersector. + ******************************************************************************/ + +/* + * Push an instance to the stack. + */ +RTC_FORCEINLINE bool push(RTCIntersectContext* context, + unsigned instanceId) +{ +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT; + /* We assert here because instances are silently dropped when the stack is full. + This might be quite hard to find in production. */ + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[context->instStackSize++] = instanceId; + return spaceAvailable; +#else + const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID); + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[0] = instanceId; + return spaceAvailable; +#endif +} + + +/* + * Pop the last instance pushed to the stack. + * Do not call on an empty stack. + */ +RTC_FORCEINLINE void pop(RTCIntersectContext* context) +{ + assert(context); +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + assert(context->instStackSize > 0); + context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID; +#else + assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +#endif +} + +/* + * Optimized instance id stack copy. + * The copy() functions will either copy full + * stacks or copy only until the last valid element has been copied, depending + * on RTC_MAX_INSTANCE_LEVEL_COUNT. + */ +RTC_FORCEINLINE void copy_UU(const unsigned* src, unsigned* tgt) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0] = src[0]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l] = src[l]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0] = src[0]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l] = src[l]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt, size_t j) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0][j] = src[0]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l][j] = src[l]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_UV(const unsigned* src, vuint<K>* tgt, const vbool<K>& mask) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + vuint<K>::store(mask, tgt, src[0]); + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + vuint<K>::store(mask, tgt + l, src[l]); + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_VU(const vuint<K>* src, unsigned* tgt, size_t i) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0] = src[0][i]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l] = src[l][i]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l][i] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_VV(const vuint<K>* src, vuint<K>* tgt, size_t i, size_t j) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + tgt[0][j] = src[0][i]; + +#else + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + tgt[l][j] = src[l][i]; + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) + if (src[l][i] == RTC_INVALID_GEOMETRY_ID) + break; + } +#endif +} + +template <int K> +RTC_FORCEINLINE void copy_VV(const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + vuint<K>::store(mask, tgt, src[0]); + +#else + vbool<K> done = !mask; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) { + vuint<K>::store(mask, tgt + l, src[l]); + if (RTC_MAX_INSTANCE_LEVEL_COUNT > 4) { + done |= src[l] == RTC_INVALID_GEOMETRY_ID; + if (all(done)) break; + } + } +#endif +} + +} // namespace instance_id_stack +} // namespace embree diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h new file mode 100644 index 0000000000..ae6556336c --- /dev/null +++ b/thirdparty/embree/kernels/common/isa.h @@ -0,0 +1,246 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" + +namespace embree +{ +#define DEFINE_SYMBOL2(type,name) \ + typedef type (*name##Func)(); \ + name##Func name; + +#define DECLARE_SYMBOL2(type,name) \ + namespace sse2 { extern type name(); } \ + namespace sse42 { extern type name(); } \ + namespace avx { extern type name(); } \ + namespace avx2 { extern type name(); } \ + namespace avx512 { extern type name(); } \ + void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \ + type name##_error() { return type(name##_error2); } \ + type name##_zero() { return type(nullptr); } + +#define DECLARE_ISA_FUNCTION(type,symbol,args) \ + namespace sse2 { extern type symbol(args); } \ + namespace sse42 { extern type symbol(args); } \ + namespace avx { extern type symbol(args); } \ + namespace avx2 { extern type symbol(args); } \ + namespace avx512 { extern type symbol(args); } \ + inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \ + typedef type (*symbol##Ty)(args); \ + +#define DEFINE_ISA_FUNCTION(type,symbol,args) \ + typedef type (*symbol##Func)(args); \ + symbol##Func symbol; + +#define ZERO_SYMBOL(features,intersector) \ + intersector = intersector##_zero; + +#define INIT_SYMBOL(features,intersector) \ + intersector = decltype(intersector)(intersector##_error); + +#define SELECT_SYMBOL_DEFAULT(features,intersector) \ + intersector = isa::intersector; + +#if defined(__SSE__) +#if !defined(EMBREE_TARGET_SIMD4) +#define EMBREE_TARGET_SIMD4 +#endif +#endif + +#if defined(EMBREE_TARGET_SSE42) +#define SELECT_SYMBOL_SSE42(features,intersector) \ + if ((features & SSE42) == SSE42) intersector = sse42::intersector; +#else +#define SELECT_SYMBOL_SSE42(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX) || defined(__AVX__) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & ISA) == ISA) intersector = isa::intersector; +#else +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & AVX) == AVX) intersector = avx::intersector; +#endif +#else +#define SELECT_SYMBOL_AVX(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX2) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#define SELECT_SYMBOL_AVX2(features,intersector) \ + if ((features & AVX2) == AVX2) intersector = avx2::intersector; +#else +#define SELECT_SYMBOL_AVX2(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX512) +#if !defined(EMBREE_TARGET_SIMD16) +#define EMBREE_TARGET_SIMD16 +#endif +#define SELECT_SYMBOL_AVX512(features,intersector) \ + if ((features & AVX512) == AVX512) intersector = avx512::intersector; +#else +#define SELECT_SYMBOL_AVX512(features,intersector) +#endif + +#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512(features,intersector) \ + ZERO_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX512(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX512(features,intersector); + +#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + + struct VerifyMultiTargetLinking { + static __noinline int getISA(int depth = 5) { + if (depth == 0) return ISA; + else return getISA(depth-1); + } + }; + namespace sse2 { int getISA(); }; + namespace sse42 { int getISA(); }; + namespace avx { int getISA(); }; + namespace avx2 { int getISA(); }; + namespace avx512 { int getISA(); }; +} diff --git a/thirdparty/embree/kernels/common/motion_derivative.h b/thirdparty/embree/kernels/common/motion_derivative.h new file mode 100644 index 0000000000..c619d6a675 --- /dev/null +++ b/thirdparty/embree/kernels/common/motion_derivative.h @@ -0,0 +1,325 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/math/affinespace.h" +#include "../../common/math/interval.h" + +#include <functional> + +namespace embree { + +#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f + +static void motion_derivative_coefficients(const float *p, float *coeff); + +struct MotionDerivativeCoefficients +{ + float theta; + float coeffs[3*8*7]; + + MotionDerivativeCoefficients() {} + + // xfm0 and xfm1 are interpret as quaternion decomposition + MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1) + { + // cosTheta of the two quaternions + const float cosTheta = min(1.f, max(-1.f, + xfm0.l.vx.w * xfm1.l.vx.w + + xfm0.l.vy.w * xfm1.l.vy.w + + xfm0.l.vz.w * xfm1.l.vz.w + + xfm0.p.w * xfm1.p.w)); + + theta = std::acos(cosTheta); + Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w); + if (cosTheta < 0.995f) { + // compute perpendicular quaternion + qperp.x = xfm1.p.w - cosTheta * xfm0.p.w; + qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w; + qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w; + qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w; + qperp = normalize(qperp); + } + const float p[33] = { + theta, + xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0 + xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1 + xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0 + qperp.x, qperp.y, qperp.z, qperp.w, + xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0 + xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y, + xfm0.l.vz.z, xfm0.p.z, + xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1 + xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y, + xfm1.l.vz.z, xfm1.p.z + }; + motion_derivative_coefficients(p, coeffs); + } +}; + +struct MotionDerivative +{ + float twoTheta; + float c[8]; + + MotionDerivative(MotionDerivativeCoefficients const& mdc, + int dim, Vec3fa const& p0, Vec3fa const& p1) + : twoTheta(2.f*mdc.theta) + { + const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z }; + for (int i = 0; i < 8; ++i) { + c[i] = 0; + for (int j = 0; j < 7; ++j) { + c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j]; + } + } + } + + template<typename T> + struct EvalMotionDerivative + { + MotionDerivative const& md; + float offset; + + EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {} + + T operator()(T const& time) const { + return md.c[0] + md.c[1] * time + + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time) + + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time) + + offset; + } + }; + + unsigned int findRoots( + Interval1f const& interval, + float offset, + float* roots, + unsigned int maxNumRoots) + { + unsigned int numRoots = 0; + EvalMotionDerivative<Interval1f> eval(*this, offset); + findRoots(eval, interval, numRoots, roots, maxNumRoots); + return numRoots; + } + + template<typename Eval> + static void findRoots( + + Eval const& eval, + Interval1f const& interval, + unsigned int& numRoots, + float* roots, + unsigned int maxNumRoots) + { + Interval1f range = eval(interval); + if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return; + + const float split = 0.5f * (interval.upper + interval.lower); + if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f || abs(split-interval.upper) < 1e-7f) + { + // check if the root already exists + for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) { + if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON) + return; + } + if (numRoots < maxNumRoots) { + roots[numRoots++] = split; + } + if (numRoots > maxNumRoots) { + printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS + return; + } + return; + } + + findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots); + findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots); + } +}; + +/****************************************************************************** + * Code generated with sympy 1.4 * + * See http://www.sympy.org/ for more information. * + * * + * see * + * * + * scripts/generate_motion_derivative_coefficients.py * + * * + * for how this code is generated * + * * + ******************************************************************************/ +static void motion_derivative_coefficients(const float *p, float *coeff) +{ + coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27]; + coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24]; + coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25]; + coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26]; + coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15]; + coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16]; + coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17]; + coeff[7] = 0; + coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24]; + coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25]; + coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26]; + coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24]; + coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25]; + coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26]; + coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27]; + coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24]; + coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25]; + coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26]; + coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15]; + coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16]; + coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17]; + coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0]; + coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24]; + coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25]; + coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26]; + coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24]; + coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25]; + coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26]; + coeff[28] = 0; + coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0]; + coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0]; + coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0]; + coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0]; + coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0]; + coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0]; + coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27]; + coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24]; + coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25]; + coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26]; + coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15]; + coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16]; + coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17]; + coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0]; + coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24]; + coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25]; + coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26]; + coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24]; + coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25]; + coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26]; + coeff[49] = 0; + coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0]; + coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0]; + coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0]; + coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0]; + coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0]; + coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0]; + coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30]; + coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24]; + coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28]; + coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29]; + coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15]; + coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19]; + coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20]; + coeff[63] = 0; + coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28]; + coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29]; + coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28]; + coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29]; + coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30]; + coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24]; + coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28]; + coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29]; + coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15]; + coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19]; + coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20]; + coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0]; + coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28]; + coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29]; + coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28]; + coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29]; + coeff[84] = 0; + coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0]; + coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0]; + coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0]; + coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0]; + coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0]; + coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0]; + coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30]; + coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24]; + coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28]; + coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29]; + coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15]; + coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19]; + coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20]; + coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0]; + coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24]; + coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28]; + coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29]; + coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24]; + coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28]; + coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29]; + coeff[105] = 0; + coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0]; + coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0]; + coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0]; + coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0]; + coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0]; + coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0]; + coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32]; + coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24]; + coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28]; + coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31]; + coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15]; + coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19]; + coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22]; + coeff[119] = 0; + coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31]; + coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31]; + coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30]; + coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24]; + coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28]; + coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29]; + coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15]; + coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19]; + coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20]; + coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0]; + coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29]; + coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29]; + coeff[140] = 0; + coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0]; + coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0]; + coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0]; + coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0]; + coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0]; + coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0]; + coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32]; + coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24]; + coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28]; + coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31]; + coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15]; + coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19]; + coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22]; + coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0]; + coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24]; + coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28]; + coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31]; + coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24]; + coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28]; + coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31]; + coeff[161] = 0; + coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0]; + coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0]; + coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0]; + coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0]; + coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0]; + coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0]; +} + +} // namespace embree diff --git a/thirdparty/embree/kernels/common/point_query.h b/thirdparty/embree/kernels/common/point_query.h new file mode 100644 index 0000000000..7d55c91fff --- /dev/null +++ b/thirdparty/embree/kernels/common/point_query.h @@ -0,0 +1,136 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* Point query structure for closest point query */ + template<int K> + struct RTC_ALIGN(16) PointQueryK + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero) + : p(p), time(time), radius(radius) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool<K> valid() const + { + const vbool<K> vx = (abs(p.x) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vy = (abs(p.y) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vz = (abs(p.z) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vn = radius >= vfloat<K>(0); + const vbool<K> vf = abs(time) < vfloat<K>(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(PointQueryK<1>* ray) const; + __forceinline void get(size_t i, PointQueryK<1>& ray) const; + __forceinline void set(const PointQueryK<1>* ray); + __forceinline void set(size_t i, const PointQueryK<1>& ray); + + Vec3vf<K> p; // location of the query point + vfloat<K> time; // time for motion blur + vfloat<K> radius; // radius for the point query + }; + + /* Specialization for a single point query */ + template<> + struct RTC_ALIGN(16) PointQueryK<1> + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero) + : p(p), time(time), radius(radius) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf); + } + + Vec3f p; + float time; + float radius; + }; + + /* Converts point query packet to single point query */ + template<int K> + __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + query[i].p.x = p.x[i]; + query[i].p.y = p.y[i]; + query[i].p.z = p.z[i]; + query[i].time = time[i]; + query[i].radius = radius[i]; + } + } + + /* Extracts a single point query out of a point query packet*/ + template<int K> + __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const + { + query.p.x = p.x[i]; + query.p.y = p.y[i]; + query.p.z = p.z[i]; + query.radius = radius[i]; + query.time = time[i]; + } + + /* Converts single point query to point query packet */ + template<int K> + __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query) + { + for (size_t i = 0; i < K; i++) + { + p.x[i] = query[i].p.x; + p.y[i] = query[i].p.y; + p.z[i] = query[i].p.z; + radius[i] = query[i].radius; + time[i] = query[i].time; + } + } + + /* inserts a single point query into a point query packet element */ + template<int K> + __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query) + { + p.x[i] = query.p.x; + p.y[i] = query.p.y; + p.z[i] = query.p.z; + radius[i] = query.radius; + time[i] = query.time; + } + + /* Shortcuts */ + typedef PointQueryK<1> PointQuery; + typedef PointQueryK<4> PointQuery4; + typedef PointQueryK<8> PointQuery8; + typedef PointQueryK<16> PointQuery16; + struct PointQueryN; + + /* Outputs point query to stream */ + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query) + { + cout << "{ " << embree_endl + << " p = " << query.p << embree_endl + << " r = " << query.radius << embree_endl + << " time = " << query.time << embree_endl + << "}"; + return cout; + } +} diff --git a/thirdparty/embree/kernels/common/primref.h b/thirdparty/embree/kernels/common/primref.h new file mode 100644 index 0000000000..d61763487b --- /dev/null +++ b/thirdparty/embree/kernels/common/primref.h @@ -0,0 +1,138 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(32) PrimRef + { + __forceinline PrimRef () {} + +#if defined(__AVX__) + __forceinline PrimRef(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); + } + __forceinline PrimRef& operator=(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this; + } +#endif + + __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) + { + lower = Vec3fx(bounds.lower, geomID); + upper = Vec3fx(bounds.upper, primID); + } + + __forceinline PrimRef (const BBox3fa& bounds, size_t id) + { +#if defined(__64BIT__) + lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF)); + upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF)); +#else + lower = Vec3fx(bounds.lower, (unsigned)id); + upper = Vec3fx(bounds.upper, (unsigned)0); +#endif + } + + /*! calculates twice the center of the primitive */ + __forceinline const Vec3fa center2() const { + return lower+upper; + } + + /*! return the bounding box of the primitive */ + __forceinline const BBox3fa bounds() const { + return BBox3fa(lower,upper); + } + + /*! size for bin heuristic is 1 */ + __forceinline unsigned size() const { + return 1; + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = embree::center2(bounds_o); + } + + __forceinline unsigned& geomIDref() { // FIXME: remove !!!!!!! + return lower.u; + } + __forceinline unsigned& primIDref() { // FIXME: remove !!!!!!! + return upper.u; + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__64BIT__) + return size_t(lower.u) + (size_t(upper.u) << 32); +#else + return size_t(lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }"; + } + + public: + Vec3fx lower; //!< lower bounds and geomID + Vec3fx upper; //!< upper bounds and primID + }; + + /*! fast exchange for PrimRefs */ + __forceinline void xchg(PrimRef& a, PrimRef& b) + { +#if defined(__AVX__) + const vfloat8 aa = vfloat8::load((float*)&a); + const vfloat8 bb = vfloat8::load((float*)&b); + vfloat8::store((float*)&a,bb); + vfloat8::store((float*)&b,aa); +#else + std::swap(a,b); +#endif + } + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct SubGridBuildData { + unsigned short sx,sy; + unsigned int primID; + + __forceinline SubGridBuildData() {}; + __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {}; + + __forceinline size_t x() const { return (size_t)sx & 0x7fff; } + __forceinline size_t y() const { return (size_t)sy & 0x7fff; } + + }; +} diff --git a/thirdparty/embree/kernels/common/primref_mb.h b/thirdparty/embree/kernels/common/primref_mb.h new file mode 100644 index 0000000000..fb08a05003 --- /dev/null +++ b/thirdparty/embree/kernels/common/primref_mb.h @@ -0,0 +1,262 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +#define MBLUR_BIN_LBBOX 1 + +namespace embree +{ +#if MBLUR_BIN_LBBOX + + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct PrimRefMB + { + typedef LBBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); + lbounds.bounds0.lower.a = geomID; + lbounds.bounds0.upper.a = primID; + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__64BIT__) + lbounds.bounds0.lower.a = id & 0xFFFFFFFF; + lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.a = id; + lbounds.bounds0.upper.a = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__64BIT__) + lbounds.bounds0.lower.u = id & 0xFFFFFFFF; + lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.u = id; + lbounds.bounds0.upper.u = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + /*! returns bounds for binning */ + __forceinline LBBox3fa bounds() const { + return lbounds; + } + + /*! returns the number of time segments of this primref */ + __forceinline unsigned size() const { + return lbounds.bounds1.lower.a; + } + + __forceinline unsigned totalTimeSegments() const { + return lbounds.bounds1.upper.a; + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(totalTimeSegments())); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)totalTimeSegments()); + return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments()); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = binCenter(); + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lbounds.bounds0.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return lbounds.bounds0.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__64BIT__) + return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32); +#else + return size_t(lbounds.bounds0.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + LBBox3fx lbounds; + BBox1f time_range; // entire geometry time range + }; + +#else + + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(16) PrimRefMB + { + typedef BBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); + bbox.lower.a = geomID; + bbox.upper.a = primID; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__64BIT__) + bbox.lower.u = id & 0xFFFFFFFF; + bbox.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + bbox.lower.u = id; + bbox.upper.u = 0; +#endif + } + + /*! returns bounds for binning */ + __forceinline BBox3fa bounds() const { + return bbox; + } + + /*! returns the number of time segments of this primref */ + __forceinline unsigned int size() const { + return _activeTimeSegments; + } + + __forceinline unsigned int totalTimeSegments() const { + return _totalTimeSegments; + } + + /* calculate overlapping time segment range */ + __forceinline range<int> timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(_totalTimeSegments)); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)_totalTimeSegments); + return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(bounds()); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = center2(bounds()); + } + + /*! returns the geometry ID */ + __forceinline unsigned int geomID() const { + return bbox.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned int primID() const { + return bbox.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__64BIT__) + return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); +#else + return size_t(bbox.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + BBox3fa bbox; // bounds, geomID, primID + unsigned int _activeTimeSegments; + unsigned int _totalTimeSegments; + BBox1f time_range; // entire geometry time range + }; + +#endif +} diff --git a/thirdparty/embree/kernels/common/profile.h b/thirdparty/embree/kernels/common/profile.h new file mode 100644 index 0000000000..5ef7f6ec0f --- /dev/null +++ b/thirdparty/embree/kernels/common/profile.h @@ -0,0 +1,159 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! helper structure for the implementation of the profile functions below */ + struct ProfileTimer + { + static const size_t N = 20; + + ProfileTimer () {} + + ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0) + { + for (size_t i=0; i<N; i++) names[i] = nullptr; + for (size_t i=0; i<N; i++) dt_fst[i] = 0.0; + for (size_t i=0; i<N; i++) dt_min[i] = pos_inf; + for (size_t i=0; i<N; i++) dt_avg[i] = 0.0; + for (size_t i=0; i<N; i++) dt_max[i] = neg_inf; + } + + __forceinline void begin() + { + j=0; + t0 = tj = getSeconds(); + } + + __forceinline void end() { + absolute("total"); + i++; + } + + __forceinline void operator() (const char* name) { + relative(name); + } + + __forceinline void absolute (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-t0; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + __forceinline void relative (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-tj; + tj = t1; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + void print(size_t numElements) + { + for (size_t k=0; k<N; k++) + dt_avg[k] /= double(i-numSkip); + + printf(" profile [M/s]:\n"); + for (size_t j=0; j<maxJ; j++) + printf("%20s: fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n", + names[j],numElements/dt_fst[j]*1E-6,numElements/dt_max[j]*1E-6,numElements/dt_avg[j]*1E-6,numElements/dt_min[j]*1E-6); + + printf(" profile [ms]:\n"); + for (size_t j=0; j<maxJ; j++) + printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n", + names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]); + } + + void print() + { + printf(" profile:\n"); + + for (size_t k=0; k<N; k++) + dt_avg[k] /= double(i-numSkip); + + for (size_t j=0; j<maxJ; j++) { + printf("%20s: fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n", + names[j],1000.0*dt_fst[j],1000.0*dt_min[j],1000.0*dt_avg[j],1000.0*dt_max[j]); + } + } + + double avg() { + return dt_avg[maxJ-1]/double(i-numSkip); + } + + private: + size_t i; + size_t j; + size_t maxJ; + size_t numSkip; + double t0; + double tj; + const char* names[N]; + double dt_fst[N]; + double dt_min[N]; + double dt_avg[N]; + double dt_max[N]; + }; + + /*! This function executes some code block multiple times and measured sections of it. + Use the following way: + + profile(1,10,1000,[&](ProfileTimer& timer) { + // code + timer("A"); + // code + timer("B"); + }); + */ + template<typename Closure> + void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + ProfileTimer timer(numSkip); + + for (size_t i=0; i<numSkip+numIter; i++) + { + timer.begin(); + closure(timer); + timer.end(); + } + timer.print(numElements); + } + + /*! similar as the function above, but the timer object comes externally */ + template<typename Closure> + void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + timer = ProfileTimer(numSkip); + + for (size_t i=0; i<numSkip+numIter; i++) + { + timer.begin(); + closure(timer); + timer.end(); + } + timer.print(numElements); + } +} diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h new file mode 100644 index 0000000000..7b951cc1e8 --- /dev/null +++ b/thirdparty/embree/kernels/common/ray.h @@ -0,0 +1,1517 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "instance_stack.h" + +// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted + +namespace embree +{ + static const size_t MAX_INTERNAL_STREAM_SIZE = 32; + + /* Ray structure for K rays */ + template<int K> + struct RayK + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir, + const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf, + const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0) + : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool<K> valid() const + { + const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf); + const vbool<K> vf = abs(tfar) <= vfloat<K>(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(RayK<1>* ray) const; + __forceinline void get(size_t i, RayK<1>& ray) const; + __forceinline void set(const RayK<1>* ray); + __forceinline void set(size_t i, const RayK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + __forceinline vint<K> octant() const + { + return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) | + select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) | + select(dir.z < 0.0f, vint<K>(4), vint<K>(zero)); + } + + /* Ray data */ + Vec3vf<K> org; // ray origin + vfloat<K> _tnear; // start of ray segment + Vec3vf<K> dir; // ray direction + vfloat<K> _time; // time of this ray for motion blur + vfloat<K> tfar; // end of ray segment + vint<K> mask; // used to mask out objects during traversal + vint<K> id; + vint<K> flags; + + __forceinline vfloat<K>& tnear() { return _tnear; } + __forceinline vfloat<K>& time() { return _time; } + __forceinline const vfloat<K>& tnear() const { return _tnear; } + __forceinline const vfloat<K>& time() const { return _time; } + }; + + /* Ray+hit structure for K rays */ + template<int K> + struct RayHitK : RayK<K> + { + using RayK<K>::org; + using RayK<K>::_tnear; + using RayK<K>::dir; + using RayK<K>::_time; + using RayK<K>::tfar; + using RayK<K>::mask; + using RayK<K>::id; + using RayK<K>::flags; + + using RayK<K>::tnear; + using RayK<K>::time; + + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir, + const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf, + const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0) + : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK(const RayK<K>& ray) + : RayK<K>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK<K>& operator =(const RayK<K>& ray) + { + org = ray.org; + _tnear = ray._tnear; + dir = ray.dir; + _time = ray._time; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit(const vbool<K>& valid0) const + { + vbool<K> valid = valid0 & geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID); + const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf)); + const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vv = (abs(u) <= vfloat<K>(FLT_LARGE)); + const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE); + const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE); + const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE); + if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t"); + if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u"); + if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v"); + if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x"); + if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y"); + if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z"); + } + + __forceinline void get(RayHitK<1>* ray) const; + __forceinline void get(size_t i, RayHitK<1>& ray) const; + __forceinline void set(const RayHitK<1>* ray); + __forceinline void set(size_t i, const RayHitK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + /* Hit data */ + Vec3vf<K> Ng; // geometry normal + vfloat<K> u; // barycentric u coordinate of hit + vfloat<K> v; // barycentric v coordinate of hit + vuint<K> primID; // primitive ID + vuint<K> geomID; // geometry ID + vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single ray */ + template<> + struct RayK<1> + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf); + } + + /* Ray data */ + Vec3ff org; // 3 floats for ray origin, 1 float for tnear + //float tnear; // start of ray segment + Vec3ff dir; // 3 floats for ray direction, 1 float for time + // float time; + float tfar; // end of ray segment + int mask; // used to mask out objects during traversal + int id; // ray ID + int flags; // ray flags + + __forceinline float& tnear() { return org.w; }; + __forceinline const float& tnear() const { return org.w; }; + + __forceinline float& time() { return dir.w; }; + __forceinline const float& time() const { return dir.w; }; + + }; + + template<> + struct RayHitK<1> : RayK<1> + { + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK(const RayK<1>& ray) + : RayK<1>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK<1>& operator =(const RayK<1>& ray) + { + org = ray.org; + dir = ray.dir; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit() const + { + if (geomID == RTC_INVALID_GEOMETRY_ID) return; + const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf)); + const bool vu = (abs(u) <= FLT_LARGE); + const bool vv = (abs(u) <= FLT_LARGE); + const bool vnx = abs(Ng.x) <= FLT_LARGE; + const bool vny = abs(Ng.y) <= FLT_LARGE; + const bool vnz = abs(Ng.z) <= FLT_LARGE; + if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t"); + if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u"); + if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v"); + if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x"); + if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y"); + if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z"); + } + + /* Hit data */ + Vec3f Ng; // not normalized geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Converts ray packet to single rays */ + template<int K> + __forceinline void RayK<K>::get(RayK<1>* ray) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = tnear()[i]; + ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time() = time()[i]; + ray[i].tfar = tfar[i]; ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i]; + } + } + + template<int K> + __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + get(i, ray[i]); + } + + /* Extracts a single ray out of a ray packet*/ + template<int K> + __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time() = time()[i]; + ray.tfar = tfar[i]; ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + } + + template<int K> + __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar = tfar[i]; ray.time() = time()[i]; + ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i]; + ray.u = u[i]; ray.v = v[i]; + ray.primID = primID[i]; ray.geomID = geomID[i]; + + instance_id_stack::copy_VU<K>(instID, ray.instID, i); + } + + /* Converts single rays to ray packet */ + template<int K> + __forceinline void RayK<K>::set(const RayK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + template<int K> + __forceinline void RayHitK<K>::set(const RayHitK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + /* inserts a single ray into a ray packet element */ + template<int K> + __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + } + + template<int K> + __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z; + u[i] = ray.u; v[i] = ray.v; + primID[i] = ray.primID; geomID[i] = ray.geomID; + + instance_id_stack::copy_UV<K>(ray.instID, instID, i); + } + + /* copies a ray packet element into another element*/ + template<int K> + __forceinline void RayK<K>::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + } + + template<int K> + __forceinline void RayHitK<K>::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source]; + u[dest] = u[source]; v[dest] = v[source]; + primID[dest] = primID[source]; geomID[dest] = geomID[source]; + + instance_id_stack::copy_VV<K>(instID, instID, source, dest); + } + + /* Shortcuts */ + typedef RayK<1> Ray; + typedef RayK<4> Ray4; + typedef RayK<8> Ray8; + typedef RayK<16> Ray16; + struct RayN; + + typedef RayHitK<1> RayHit; + typedef RayHitK<4> RayHit4; + typedef RayHitK<8> RayHit8; + typedef RayHitK<16> RayHit16; + struct RayHitN; + + template<int K, bool intersect> + struct RayTypeHelper; + + template<int K> + struct RayTypeHelper<K, true> + { + typedef RayHitK<K> Ty; + }; + + template<int K> + struct RayTypeHelper<K, false> + { + typedef RayK<K> Ty; + }; + + template<bool intersect> + using RayType = typename RayTypeHelper<1, intersect>::Ty; + + template<int K, bool intersect> + using RayTypeK = typename RayTypeHelper<K, intersect>::Ty; + + /* Outputs ray to stream */ + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray) + { + return cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << "}"; + } + + template<int K> + __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray) + { + cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << " Ng = " << ray.Ng + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + struct RayStreamSOA + { + __forceinline RayStreamSOA(void* rays, size_t N) + : ptr((char*)rays), N(N) {} + + /* ray data access functions */ + __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; } // x coordinate of ray origin + __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; } // y coordinate of ray origin + __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin + __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment + + __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction + __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction + __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction + __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur + + __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance) + __forceinline int* mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset]; }; // used to mask out objects during traversal (optional) + __forceinline int* id (size_t offset = 0) { return (int*)&ptr[10*4*N+offset]; }; // id + __forceinline int* flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset]; }; // flags + + /* hit data access functions */ + __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal + __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal + __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal + + __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }; // barycentric u coordinate of hit + __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }; // barycentric v coordinate of hit + + __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; }; // primitive ID + __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }; // geometry ID + __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }; // instance ID + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = org_x(offset)[0]; + ray.org.y = org_y(offset)[0]; + ray.org.z = org_z(offset)[0]; + ray.tnear() = tnear(offset)[0]; + ray.dir.x = dir_x(offset)[0]; + ray.dir.y = dir_y(offset)[0]; + ray.dir.z = dir_z(offset)[0]; + ray.time() = time(offset)[0]; + ray.tfar = tfar(offset)[0]; + ray.mask = mask(offset)[0]; + ray.id = id(offset)[0]; + ray.flags = flags(offset)[0]; + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(org_x(offset)); + ray.org.y = vfloat<K>::loadu(org_y(offset)); + ray.org.z = vfloat<K>::loadu(org_z(offset)); + ray.tnear = vfloat<K>::loadu(tnear(offset)); + ray.dir.x = vfloat<K>::loadu(dir_x(offset)); + ray.dir.y = vfloat<K>::loadu(dir_y(offset)); + ray.dir.z = vfloat<K>::loadu(dir_z(offset)); + ray.time = vfloat<K>::loadu(time(offset)); + ray.tfar = vfloat<K>::loadu(tfar(offset)); + ray.mask = vint<K>::loadu(mask(offset)); + ray.id = vint<K>::loadu(id(offset)); + ray.flags = vint<K>::loadu(flags(offset)); + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(valid, org_x(offset)); + ray.org.y = vfloat<K>::loadu(valid, org_y(offset)); + ray.org.z = vfloat<K>::loadu(valid, org_z(offset)); + ray.tnear() = vfloat<K>::loadu(valid, tnear(offset)); + ray.dir.x = vfloat<K>::loadu(valid, dir_x(offset)); + ray.dir.y = vfloat<K>::loadu(valid, dir_y(offset)); + ray.dir.z = vfloat<K>::loadu(valid, dir_z(offset)); + ray.time() = vfloat<K>::loadu(valid, time(offset)); + ray.tfar = vfloat<K>::loadu(valid, tfar(offset)); + +#if !defined(__AVX__) + /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked loads always access the entire vector */ + if (unlikely(!all(valid))) + { + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + ray.mask[k] = mask(offset)[k]; + ray.id[k] = id(offset)[k]; + ray.flags[k] = flags(offset)[k]; + } + } + } + else +#endif + { + ray.mask = vint<K>::loadu(valid, mask(offset)); + ray.id = vint<K>::loadu(valid, id(offset)); + ray.flags = vint<K>::loadu(valid, flags(offset)); + } + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray) + { + /* + * valid_i: stores which of the input rays exist (do not access nonexistent rays!) + * valid: stores which of the rays actually hit something. + */ + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat<K>::storeu(valid, tfar(offset), ray.tfar); + vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x); + vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y); + vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z); + vfloat<K>::storeu(valid, u(offset), ray.u); + vfloat<K>::storeu(valid, v(offset), ray.v); + +#if !defined(__AVX__) + /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked stores always access the entire vector */ + if (unlikely(!all(valid_i))) + { + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + primID(offset)[k] = ray.primID[k]; + geomID(offset)[k] = ray.geomID[k]; + + instID(0, offset)[k] = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + instID(l, offset)[k] = ray.instID[l][k]; +#endif + } + } + } + else +#endif + { + vuint<K>::storeu(valid, primID(offset), ray.primID); + vuint<K>::storeu(valid, geomID(offset), ray.geomID); + + vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]); +#endif + } + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat<K>::storeu(valid, tfar(offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = dir_x(offset)[0]; + const float dy = dir_y(offset)[0]; + const float dz = dir_z(offset)[0]; + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear(offset)[0]; + const float ffar = tfar(offset)[0]; + return nnear <= ffar; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + RayK<K> ray; + +#if defined(__AVX2__) + ray.org.x = vfloat<K>::template gather<1>(valid, org_x(), offset); + ray.org.y = vfloat<K>::template gather<1>(valid, org_y(), offset); + ray.org.z = vfloat<K>::template gather<1>(valid, org_z(), offset); + ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset); + ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x(), offset); + ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y(), offset); + ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z(), offset); + ray.time() = vfloat<K>::template gather<1>(valid, time(), offset); + ray.tfar = vfloat<K>::template gather<1>(valid, tfar(), offset); + ray.mask = vint<K>::template gather<1>(valid, mask(), offset); + ray.id = vint<K>::template gather<1>(valid, id(), offset); + ray.flags = vint<K>::template gather<1>(valid, flags(), offset); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.time() = zero; + ray.tfar = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *org_x(ofs); + ray.org.y[k] = *org_y(ofs); + ray.org.z[k] = *org_z(ofs); + ray.tnear()[k] = *tnear(ofs); + ray.dir.x[k] = *dir_x(ofs); + ray.dir.y[k] = *dir_y(ofs); + ray.dir.z[k] = *dir_z(ofs); + ray.time()[k] = *time(ofs); + ray.tfar[k] = *tfar(ofs); + ray.mask[k] = *mask(ofs); + ray.id[k] = *id(ofs); + ray.flags[k] = *flags(ofs); + } + } +#endif + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); + vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x); + vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y); + vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, u(), offset, ray.u); + vfloat<K>::template scatter<1>(valid, v(), offset, ray.v); + vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID); + vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID); + + vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + + *Ng_x(ofs) = ray.Ng.x[k]; + *Ng_y(ofs) = ray.Ng.y[k]; + *Ng_z(ofs) = ray.Ng.z[k]; + *u(ofs) = ray.u[k]; + *v(ofs) = ray.v[k]; + *primID(ofs) = ray.primID[k]; + *geomID(ofs) = ray.geomID[k]; + + *instID(0, ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *instID(l, ofs) = ray.instID[l][k]; +#endif + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + } +#endif + } + } + + char* __restrict__ ptr; + size_t N; + }; + + template<size_t MAX_K> + struct StackRayStreamSOA : public RayStreamSOA + { + __forceinline StackRayStreamSOA(size_t K) + : RayStreamSOA(data, K) { assert(K <= MAX_K); } + + char data[MAX_K / 4 * sizeof(RayHit4)]; + }; + + + struct RayStreamSOP + { + template<class T> + __forceinline void init(T& t) + { + org_x = (float*)&t.org.x; + org_y = (float*)&t.org.y; + org_z = (float*)&t.org.z; + tnear = (float*)&t.tnear; + dir_x = (float*)&t.dir.x; + dir_y = (float*)&t.dir.y; + dir_z = (float*)&t.dir.z; + time = (float*)&t.time; + tfar = (float*)&t.tfar; + mask = (unsigned int*)&t.mask; + id = (unsigned int*)&t.id; + flags = (unsigned int*)&t.flags; + + Ng_x = (float*)&t.Ng.x; + Ng_y = (float*)&t.Ng.y; + Ng_z = (float*)&t.Ng.z; + u = (float*)&t.u; + v = (float*)&t.v; + primID = (unsigned int*)&t.primID; + geomID = (unsigned int*)&t.geomID; + + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = (unsigned int*)&t.instID[l]; + } + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = *(float* __restrict__)((char*)org_x + offset); + ray.org.y = *(float* __restrict__)((char*)org_y + offset); + ray.org.z = *(float* __restrict__)((char*)org_z + offset); + ray.dir.x = *(float* __restrict__)((char*)dir_x + offset); + ray.dir.y = *(float* __restrict__)((char*)dir_y + offset); + ray.dir.z = *(float* __restrict__)((char*)dir_z + offset); + ray.tfar = *(float* __restrict__)((char*)tfar + offset); + ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + ray.time() = time ? *(float* __restrict__)((char*)time + offset) : 0.0f; + ray.mask = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1; + ray.id = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1; + ray.flags = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1; + return ray; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset) + { + RayK<K> ray; + ray.org.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset)); + ray.org.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset)); + ray.org.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset)); + ray.dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + ray.dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + ray.dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + ray.tfar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + ray.time() = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f; + ray.mask = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1; + ray.id = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1; + ray.flags = flags ? vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1; + return ray; + } + + template<int K> + __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset) + { + Vec3vf<K> dir; + dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + return dir; + } + + __forceinline void setHitByOffset(size_t offset, const RayHit& ray) + { + if (ray.geomID != RTC_INVALID_GEOMETRY_ID) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z; + *(float* __restrict__)((char*)u + offset) = ray.u; + *(float* __restrict__)((char*)v + offset) = ray.v; + *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID; + *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l]; +#endif + } + } + } + + __forceinline void setHitByOffset(size_t offset, const Ray& ray) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + + if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x); + if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y); + if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z); + vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u); + vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v); + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID); + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID); + + if (likely(instID[0])) { + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]); +#endif + } + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = *(float* __restrict__)((char*)dir_x + offset); + const float dy = *(float* __restrict__)((char*)dir_y + offset); + const float dz = *(float* __restrict__)((char*)dir_z + offset); + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + const float ffar = *(float* __restrict__)((char*)tfar + offset); + return nnear <= ffar; + } + + template<int K> + __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset) + { + const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + const vfloat<K> ffar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + return nnear <= ffar; + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + RayK<K> ray; + +#if defined(__AVX2__) + ray.org.x = vfloat<K>::template gather<1>(valid, org_x, offset); + ray.org.y = vfloat<K>::template gather<1>(valid, org_y, offset); + ray.org.z = vfloat<K>::template gather<1>(valid, org_z, offset); + ray.dir.x = vfloat<K>::template gather<1>(valid, dir_x, offset); + ray.dir.y = vfloat<K>::template gather<1>(valid, dir_y, offset); + ray.dir.z = vfloat<K>::template gather<1>(valid, dir_z, offset); + ray.tfar = vfloat<K>::template gather<1>(valid, tfar, offset); + ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero); + ray.time() = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero); + ray.mask = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1); + ray.id = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1); + ray.flags = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.tfar = zero; + ray.time() = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *(float* __restrict__)((char*)org_x + ofs); + ray.org.y[k] = *(float* __restrict__)((char*)org_y + ofs); + ray.org.z[k] = *(float* __restrict__)((char*)org_z + ofs); + ray.dir.x[k] = *(float* __restrict__)((char*)dir_x + ofs); + ray.dir.y[k] = *(float* __restrict__)((char*)dir_y + ofs); + ray.dir.z[k] = *(float* __restrict__)((char*)dir_z + ofs); + ray.tfar[k] = *(float* __restrict__)((char*)tfar + ofs); + ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f; + ray.time()[k] = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f; + ray.mask[k] = mask ? *(int* __restrict__)((char*)mask + ofs) : -1; + ray.id[k] = id ? *(int* __restrict__)((char*)id + ofs) : -1; + ray.flags[k] = flags ? *(int* __restrict__)((char*)flags + ofs) : -1; + } + } +#endif + + return ray; + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar); + + if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x); + if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y); + if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, u, offset, ray.u); + vfloat<K>::template scatter<1>(valid, v, offset, ray.v); + vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID); + vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID); + + if (likely(instID[0])) { + vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]); +#endif + } +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k]; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k]; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k]; + *(float* __restrict__)((char*)u + ofs) = ray.u[k]; + *(float* __restrict__)((char*)v + ofs) = ray.v[k]; + *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k]; + *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k]; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k]; +#endif + } + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + } +#endif + } + } + + /* ray data */ + float* __restrict__ org_x; // x coordinate of ray origin + float* __restrict__ org_y; // y coordinate of ray origin + float* __restrict__ org_z; // z coordinate of ray origin + float* __restrict__ tnear; // start of ray segment (optional) + + float* __restrict__ dir_x; // x coordinate of ray direction + float* __restrict__ dir_y; // y coordinate of ray direction + float* __restrict__ dir_z; // z coordinate of ray direction + float* __restrict__ time; // time of this ray for motion blur (optional) + + float* __restrict__ tfar; // end of ray segment (set to hit distance) + unsigned int* __restrict__ mask; // used to mask out objects during traversal (optional) + unsigned int* __restrict__ id; // ray ID + unsigned int* __restrict__ flags; // ray flags + + /* hit data */ + float* __restrict__ Ng_x; // x coordinate of geometry normal (optional) + float* __restrict__ Ng_y; // y coordinate of geometry normal (optional) + float* __restrict__ Ng_z; // z coordinate of geometry normal (optional) + + float* __restrict__ u; // barycentric u coordinate of hit + float* __restrict__ v; // barycentric v coordinate of hit + + unsigned int* __restrict__ primID; // primitive ID + unsigned int* __restrict__ geomID; // geometry ID + unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional) + }; + + + struct RayStreamAOS + { + __forceinline RayStreamAOS(void* rays) + : ptr((Ray*)rays) {} + + __forceinline Ray& getRayByOffset(size_t offset) + { + return *(Ray*)((char*)ptr + offset); + } + + template<int K> + __forceinline RayK<K> getRayByOffset(const vint<K>& offset); + + template<int K> + __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset) + { + const vint<K> valid_offset = select(valid, offset, vintx(zero)); + return getRayByOffset<K>(valid_offset); + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u); + vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v); + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID); + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID); + + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + + instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k); + } +#endif + } + } + + template<int K> + __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + } +#endif + } + } + + Ray* __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOS::getRayByOffset<4>(const vint4& offset) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir); + const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir); + const vfloat4 b2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir); + const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOS::getRayByOffset<8>(const vint8& offset) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOS::getRayByOffset<16>(const vint16& offset) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org); + const vfloat8 ab8 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org); + const vfloat8 ab9 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org); + const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org); + const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org); + const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org); + const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org); + const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org); + const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar); + const vfloat4 c8 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar); + const vfloat4 c9 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar); + const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar); + const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar); + const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar); + const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar); + const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar); + const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + + + struct RayStreamAOP + { + __forceinline RayStreamAOP(void* rays) + : ptr((Ray**)rays) {} + + __forceinline Ray& getRayByIndex(size_t index) + { + return *ptr[index]; + } + + template<int K> + __forceinline RayK<K> getRayByIndex(const vint<K>& index); + + template<int K> + __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index) + { + const vint<K> valid_index = select(valid, index, vintx(zero)); + return getRayByIndex<K>(valid_index); + } + + template<int K> + __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + instance_id_stack::copy_VU<K>(ray.instID, ray_k->instID, k); + } + } + } + + template<int K> + __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray) + { + vbool<K> valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + } + } + } + + Ray** __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOP::getRayByIndex<4>(const vint4& index) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org); + const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org); + const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org); + const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir); + const vfloat4 b1 = vfloat4::loadu(&ptr[index[1]]->dir); + const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir); + const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOP::getRayByIndex<8>(const vint8& index) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOP::getRayByIndex<16>(const vint16& index) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + const vfloat8 ab8 = vfloat8::loadu(&ptr[index[8]]->org); + const vfloat8 ab9 = vfloat8::loadu(&ptr[index[9]]->org); + const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org); + const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org); + const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org); + const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org); + const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org); + const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + const vfloat4 c8 = vfloat4::loadu(&ptr[index[8]]->tfar); + const vfloat4 c9 = vfloat4::loadu(&ptr[index[9]]->tfar); + const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar); + const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar); + const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar); + const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar); + const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar); + const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif +} diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp new file mode 100644 index 0000000000..94b3819e42 --- /dev/null +++ b/thirdparty/embree/kernels/common/rtcore.cpp @@ -0,0 +1,1766 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "../../include/embree3/rtcore_ray.h" +using namespace embree; + +RTC_NAMESPACE_BEGIN; + + /* mutex to make API thread safe */ + static MutexSys g_mutex; + + RTC_API RTCDevice rtcNewDevice(const char* config) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewDevice); + Lock<MutexSys> lock(g_mutex); + Device* device = new Device(config); + return (RTCDevice) device->refInc(); + RTC_CATCH_END(nullptr); + return (RTCDevice) nullptr; + } + + RTC_API void rtcRetainDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainDevice); + RTC_VERIFY_HANDLE(hdevice); + Lock<MutexSys> lock(g_mutex); + device->refInc(); + RTC_CATCH_END(nullptr); + } + + RTC_API void rtcReleaseDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseDevice); + RTC_VERIFY_HANDLE(hdevice); + Lock<MutexSys> lock(g_mutex); + device->refDec(); + RTC_CATCH_END(nullptr); + } + + RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceProperty); + RTC_VERIFY_HANDLE(hdevice); + Lock<MutexSys> lock(g_mutex); + return device->getProperty(prop); + RTC_CATCH_END(device); + return 0; + } + + RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceProperty); + const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; + if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings + Lock<MutexSys> lock(g_mutex); + device->setProperty(prop,val); + RTC_CATCH_END(device); + } + + RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceError); + if (device == nullptr) return Device::getThreadErrorCode(); + else return device->getDeviceErrorCode(); + RTC_CATCH_END(device); + return RTC_ERROR_UNKNOWN; + } + + RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceErrorFunction); + RTC_VERIFY_HANDLE(hdevice); + device->setErrorFunction(error, userPtr); + RTC_CATCH_END(device); + } + + RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceMemoryMonitorFunction); + device->setMemoryMonitorFunction(memoryMonitor, userPtr); + RTC_CATCH_END(device); + } + + RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewSharedBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API void* rtcGetBufferData(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetBufferData); + RTC_VERIFY_HANDLE(hbuffer); + return buffer->data(); + RTC_CATCH_END2(buffer); + return nullptr; + } + + RTC_API void rtcRetainBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refInc(); + RTC_CATCH_END2(buffer); + } + + RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refDec(); + RTC_CATCH_END2(buffer); + } + + RTC_API RTCScene rtcNewScene (RTCDevice hdevice) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewScene); + RTC_VERIFY_HANDLE(hdevice); + Scene* scene = new Scene((Device*)hdevice); + return (RTCScene) scene->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneDevice); + RTC_VERIFY_HANDLE(hscene); + return (RTCDevice)scene->device->refInc(); // user will own one additional device reference + RTC_CATCH_END2(scene); + return (RTCDevice)nullptr; + } + + RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneProgressMonitorFunction); + RTC_VERIFY_HANDLE(hscene); + Lock<MutexSys> lock(g_mutex); + scene->setProgressMonitorFunction(progress,ptr); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneBuildQuality); + RTC_VERIFY_HANDLE(hscene); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + scene->setSceneFlags(flags); + RTC_CATCH_END2(scene); + } + + RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + return scene->getSceneFlags(); + RTC_CATCH_END2(scene); + return RTC_SCENE_FLAG_NONE; + } + + RTC_API void rtcCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(false); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcJoinCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcJoinCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(true); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + BBox3fa bounds = scene->bounds.bounds(); + bounds_o->lower_x = bounds.lower.x; + bounds_o->lower_y = bounds.lower.y; + bounds_o->lower_z = bounds.lower.z; + bounds_o->align0 = 0; + bounds_o->upper_x = bounds.upper.x; + bounds_o->upper_y = bounds.upper.y; + bounds_o->upper_z = bounds.upper.z; + bounds_o->align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (bounds_o == nullptr) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer"); + if (scene->isModified()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + + bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x; + bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y; + bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z; + bounds_o->bounds0.align0 = 0; + bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x; + bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y; + bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z; + bounds_o->bounds0.align1 = 0; + bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x; + bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y; + bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z; + bounds_o->bounds1.align0 = 0; + bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x; + bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y; + bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z; + bounds_o->bounds1.align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr) + { + Scene* scene0 = (Scene*) hscene0; + Scene* scene1 = (Scene*) hscene1; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCollide); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene0); + RTC_VERIFY_HANDLE(hscene1); + if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices"); + auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + if (scene0->numPrimitives() != nUserPrims0 && scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep"); +#endif + scene0->intersectors.collide(scene0,scene1,callback,userPtr); + RTC_CATCH_END(scene0->device); + } + + inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + bool changed = false; + if (userContext->instStackSize > 0) + { + const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + + float similarityScale = 0.f; + const bool similtude = similarityTransform(transform, &similarityScale); + assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f)); + + PointQuery query_inst; + query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z)); + query_inst.radius = query->radius * similarityScale; + query_inst.time = query->time; + + PointQueryContext context_inst(scene, (PointQuery*)query, + similtude ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB, + queryFunc, userContext, similarityScale, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst); + } + else + { + PointQueryContext context(scene, (PointQuery*)query, + POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)query, &context); + } + return changed; + } + + RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(userContext); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); + if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes"); +#endif + + return pointQuery(scene, query, userContext, queryFunc, userPtr); + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery4* query4 = (PointQuery4*)query; + PointQuery query1; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + query4->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query4->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery8* query8 = (PointQuery8*)query; + PointQuery query1; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + query8->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query8->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery16* query16 = (PointQuery16*)query; + PointQuery query1; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + PointQuery query1; query16->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query16->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT3(normal.travs,1,1,1); + IntersectContext context(scene,user_context); + scene->intersectors.intersect(*rayhit,&context); +#if defined(DEBUG) + ((RayHit*)rayhit)->verifyHit(); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray4* ray4 = (Ray4*) rayhit; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + RayHit ray1; ray4->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + ray4->set(i,ray1); + } +#else + scene->intersectors.intersect4(valid,*rayhit,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)rayhit) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray8* ray8 = (Ray8*) rayhit; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + RayHit ray1; ray8->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + ray8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.intersect8(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)rayhit) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray16* ray16 = (Ray16*) rayhit; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + RayHit ray1; ray16->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + ray16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.intersect16(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) + scene->intersectors.intersect(*rayhit,&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) + scene->intersectors.intersect(*rn[0],&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N*M,N*M,N*M); + IntersectContext context(scene,user_context); + + /* code path for single ray streams */ + if (likely(N == 1)) + { + /* fast code path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar)) + scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context); + } + /* normal codepath for single ray streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes"); + if (((size_t)rayhit->ray.time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes"); + if (((size_t)rayhit->ray.mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes"); + if (((size_t)rayhit->hit.u ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes"); + if (((size_t)rayhit->hit.v ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes"); + if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1); + STAT3(shadow.travs,1,1,1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + IntersectContext context(scene,user_context); + scene->intersectors.occluded(*ray,&context); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit4* ray4 = (RayHit4*) ray; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + RayHit ray1; ray4->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray4->geomID[i] = ray1.geomID; + } +#else + scene->intersectors.occluded4(valid,*ray,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)ray) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit8* ray8 = (RayHit8*) ray; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + RayHit ray1; ray8->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.occluded8(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)ray) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit16* ray16 = (RayHit16*) ray; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + RayHit ray1; ray16->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.occluded16(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray->tnear <= ray->tfar)) + scene->intersectors.occluded (*ray,&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray[0]->tnear <= ray[0]->tfar)) + scene->intersectors.occluded (*ray[0],&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small"); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N*M,N*N,N*N); + IntersectContext context(scene,user_context); + + /* codepath for single rays */ + if (likely(N == 1)) + { + /* fast path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar)) + scene->intersectors.occluded (*(RTCRay*)ray,&context); + } + /* codepath for normal ray streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes"); + if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes"); + if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes"); + if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes"); + if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes"); + if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes"); + if (((size_t)ray->time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes"); + if (((size_t)ray->mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainScene); + RTC_VERIFY_HANDLE(hscene); + scene->refInc(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcReleaseScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseScene); + RTC_VERIFY_HANDLE(hscene); + scene->refDec(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref<Scene> scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryInstancedScene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hscene); + geometry->setInstancedScene(scene); + RTC_CATCH_END2(geometry); + } + + AffineSpace3fa loadTransform(RTCFormat format, const float* xfm) + { + AffineSpace3fa space = one; + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]), + Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]), + Vec3fa(xfm[ 2], xfm[ 6], xfm[10]), + Vec3fa(xfm[ 3], xfm[ 7], xfm[11])); + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]), + Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]), + Vec3fa(xfm[ 9], xfm[10], xfm[11])); + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]), + Vec3fa(xfm[ 8], xfm[ 9], xfm[10]), + Vec3fa(xfm[12], xfm[13], xfm[14])); + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + return space; + } + + void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm) + { + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vy.x; xfm[ 2] = space.l.vz.x; xfm[ 3] = space.p.x; + xfm[ 4] = space.l.vx.y; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vz.y; xfm[ 7] = space.p.y; + xfm[ 8] = space.l.vx.z; xfm[ 9] = space.l.vy.z; xfm[10] = space.l.vz.z; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; + xfm[ 3] = space.l.vy.x; xfm[ 4] = space.l.vy.y; xfm[ 5] = space.l.vy.z; + xfm[ 6] = space.l.vz.x; xfm[ 7] = space.l.vz.y; xfm[ 8] = space.l.vz.z; + xfm[ 9] = space.p.x; xfm[10] = space.p.y; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; xfm[ 3] = 0.f; + xfm[ 4] = space.l.vy.x; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vy.z; xfm[ 7] = 0.f; + xfm[ 8] = space.l.vz.x; xfm[ 9] = space.l.vz.y; xfm[10] = space.l.vz.z; xfm[11] = 0.f; + xfm[12] = space.p.x; xfm[13] = space.p.y; xfm[14] = space.p.z; xfm[15] = 1.f; + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + } + + RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransform); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(xfm); + const AffineSpace3fa transform = loadTransform(format, (const float*)xfm); + geometry->setTransform(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransformQuaternion); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(qd); + + AffineSpace3fx transform; + transform.l.vx.x = qd->scale_x; + transform.l.vy.y = qd->scale_y; + transform.l.vz.z = qd->scale_z; + transform.l.vy.x = qd->skew_xy; + transform.l.vz.x = qd->skew_xz; + transform.l.vz.y = qd->skew_yz; + transform.l.vx.y = qd->translation_x; + transform.l.vx.z = qd->translation_y; + transform.l.vy.z = qd->translation_z; + transform.p.x = qd->shift_x; + transform.p.y = qd->shift_y; + transform.p.z = qd->shift_z; + + // normalize quaternion + Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k); + q = normalize(q); + transform.l.vx.w = q.i; + transform.l.vy.w = q.j; + transform.l.vz.w = q.k; + transform.p.w = q.r; + + geometry->setQuaternionDecomposition(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryTransform); + const AffineSpace3fa transform = geometry->getTransform(time); + storeTransform(transform, format, (float*)xfm); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewGeometry); + RTC_VERIFY_HANDLE(hdevice); + + switch (type) + { + case RTC_GEOMETRY_TYPE_TRIANGLE: + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + createTriangleMeshTy createTriangleMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createTriangleMesh); + Geometry* geom = createTriangleMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_QUAD: + { +#if defined(EMBREE_GEOMETRY_QUAD) + createQuadMeshTy createQuadMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createQuadMesh); + Geometry* geom = createQuadMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + case RTC_GEOMETRY_TYPE_DISC_POINT: + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + { +#if defined(EMBREE_GEOMETRY_POINT) + createPointsTy createPoints = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_builder_cpu_features, createPoints); + + Geometry *geom; + switch(type) { + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + geom = createPoints(device, Geometry::GTY_SPHERE_POINT); + break; + case RTC_GEOMETRY_TYPE_DISC_POINT: + geom = createPoints(device, Geometry::GTY_DISC_POINT); + break; + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT); + break; + default: + geom = nullptr; + break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE: + { +#if defined(EMBREE_GEOMETRY_CURVE) + createLineSegmentsTy createLineSegments = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createLineSegments); + createCurvesTy createCurves = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createCurves); + + Geometry* geom; + switch (type) { + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break; + //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break; + default: geom = nullptr; break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SUBDIVISION: + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + createSubdivMeshTy createSubdivMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh); + //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason? + Geometry* geom = createSubdivMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_USER: + { +#if defined(EMBREE_GEOMETRY_USER) + createUserGeometryTy createUserGeometry = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createUserGeometry); + Geometry* geom = createUserGeometry(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_INSTANCE: + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + createInstanceTy createInstance = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createInstance); + Geometry* geom = createInstance(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_GRID: + { +#if defined(EMBREE_GEOMETRY_GRID) + createGridMeshTy createGridMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512(device->enabled_cpu_features,createGridMesh); + Geometry* geom = createGridMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported"); +#endif + } + + default: + throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type"); + } + + RTC_CATCH_END(device); + return nullptr; + } + + RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserPrimitiveCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); + + geometry->setNumPrimitives(userPrimitiveCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeStepCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (timeStepCount > RTC_MAX_TIME_STEP_COUNT) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range"); + + geometry->setNumTimeSteps(timeStepCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime) + { + Ref<Geometry> geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeRange); + RTC_VERIFY_HANDLE(hgeometry); + + if (startTime > endTime) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime"); + + geometry->setTimeRange(BBox1f(startTime,endTime)); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTopologyCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTopologyCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuildQuality); + RTC_VERIFY_HANDLE(hgeometry); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMaxRadiusScale); + RTC_VERIFY_HANDLE(hgeometry); +#if RTC_MIN_WIDTH + if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1"); + geometry->setMaxRadiusScale(maxRadiusScale); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled"); +#endif + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMask); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setMask(mask); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometrySubdivisionMode); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setSubdivisionMode(topologyID,mode); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeTopology); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeTopology(vertexAttributeID, topologyID); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref<Buffer> buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hbuffer); + + if (geometry->device != buffer->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSharedGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + Ref<Buffer> buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetNewGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */ + size_t bytes = itemCount*byteStride; + if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + bytes += (16 - (byteStride%16))%16; + + Ref<Buffer> buffer = new Buffer(geometry->device, bytes); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + return buffer->data(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryBufferData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getBuffer(type, slot); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcEnableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->enable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcUpdateGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + geometry->updateBuffer(type, slot); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDisableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->disable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTessellationRate); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTessellationRate(tessellationRate); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setUserData(ptr); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; // no ref counting here! + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getUserData(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBoundsFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setBoundsFunction(bounds,userPtr); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryDisplacementFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setDisplacementFunction(displacement); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectFunctionN(intersect); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryPointQueryFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setPointQueryFunction(pointQuery); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFirstHalfEdge); + return geometry->getFirstHalfEdge(faceID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFace); + return geometry->getFace(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryNextHalfEdge); + return geometry->getNextHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryPreviousHalfEdge); + return geometry->getPreviousHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryOppositeHalfEdge); + return geometry->getOppositeHalfEdge(topologyID,edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetOccludedFunctionN); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOccludedFunctionN(occluded); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryOccludedFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOcclusionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolate); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolate(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolateN); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolateN(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcCommitGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitGeometry); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->commit(); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry); + RTC_CATCH_END2(scene); + return -1; + } + + RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometryByID); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_GEOMID(geomID); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + scene->bind(geomID,geometry); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDetachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); + scene->detachGeometry(geomID); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refInc(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refDec(); + RTC_CATCH_END2(geometry); + } + + RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometry); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); +#endif + return (RTCGeometry) scene->get(geomID); + RTC_CATCH_END2(scene); + return nullptr; + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h new file mode 100644 index 0000000000..373e49a689 --- /dev/null +++ b/thirdparty/embree/kernels/common/rtcore.h @@ -0,0 +1,142 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../include/embree3/rtcore.h" +RTC_NAMESPACE_USE + +namespace embree +{ + /*! decoding of intersection flags */ + __forceinline bool isCoherent (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; } + __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8) +# define USE_TASK_ARENA 1 +#else +# define USE_TASK_ARENA 0 +#endif + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9 +# define TASKING_TBB_USE_TASK_ISOLATION 1 +#else +# define TASKING_TBB_USE_TASK_ISOLATION 0 +#endif + +/*! Macros used in the rtcore API implementation */ +// -- GODOT start -- +// #define RTC_CATCH_BEGIN try { +#define RTC_CATCH_BEGIN + +// #define RTC_CATCH_END(device) \ +// } catch (std::bad_alloc&) { \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END(device) + +// #define RTC_CATCH_END2(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END2(scene) + +// #define RTC_CATCH_END2_FALSE(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// return false; \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// return false; \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// return false; \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// return false; \ +// } +#define RTC_CATCH_END2_FALSE(scene) return false; +// -- GODOT end -- + +#define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_GEOMID(id) \ + if (id == RTC_INVALID_GEOMETRY_ID) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_UPPER(id,upper) \ + if (id > upper) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_RANGE(id,lower,upper) \ + if (id < lower || id > upper) \ + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds"); + +#if 0 // enable to debug print all API calls +#define RTC_TRACE(x) std::cout << #x << std::endl; +#else +#define RTC_TRACE(x) +#endif + +// -- GODOT begin -- +// /*! used to throw embree API errors */ +// struct rtcore_error : public std::exception +// { +// __forceinline rtcore_error(RTCError error, const std::string& str) +// : error(error), str(str) {} +// +// ~rtcore_error() throw() {} +// +// const char* what () const throw () { +// return str.c_str(); +// } +// +// RTCError error; +// std::string str; +// }; +// -- GODOT end -- + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ + abort(); + // -- GODOT end -- +#endif + +#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ + (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) +} diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp new file mode 100644 index 0000000000..1f1b6f6ddf --- /dev/null +++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp @@ -0,0 +1,442 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "alloc.h" + +#include "../builders/bvh_builder_sah.h" +#include "../builders/bvh_builder_morton.h" + +namespace embree +{ + namespace isa // FIXME: support more ISAs for builders + { + struct BVH : public RefCount + { + BVH (Device* device) + : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0) + { + device->refInc(); + } + + ~BVH() { + device->refDec(); + } + + public: + Device* device; + FastAllocator allocator; + mvector<BVHBuilderMorton::BuildPrim> morton_src; + mvector<BVHBuilderMorton::BuildPrim> morton_tmp; + }; + + void* rtcBuildBVHMorton(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims_i = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* initialize temporary arrays for morton builder */ + PrimRef* prims = (PrimRef*) prims_i; + mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src; + mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp; + morton_src.resize(primitiveCount); + morton_tmp.resize(primitiveCount); + + /* compute centroid bounds */ + const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa { + + BBox3fa bounds(empty); + for (size_t i=r.begin(); i<r.end(); i++) + bounds.extend(prims[i].bounds().center2()); + return bounds; + }, BBox3fa::merge); + + /* compute morton codes */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]); + for (size_t i=r.begin(); i<r.end(); i++) { + generator(prims[i].bounds(),(unsigned) i); + } + }); + + /* start morton build */ + std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that allocates BVH nodes */ + [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* { + return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + }, + + /* lambda function that sets bounds */ + [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa> + { + BBox3fa bounds = empty; + void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) { + bounds.extend(children[i].second); + childptrs[i] = children[i].first; + cbounds[i] = (const RTCBounds*)&children[i].second; + } + setNodeBounds(node,cbounds,(unsigned int)N,userPtr); + setNodeChildren(node,childptrs, (unsigned int)N,userPtr); + return std::make_pair(node,bounds); + }, + + /* lambda function that creates BVH leaves */ + [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa> + { + RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; + BBox3fa bounds = empty; + for (size_t i=0;i<current.size();i++) + { + const size_t id = morton_src[current.begin()+i].index; + bounds.extend(prims[id].bounds()); + localBuildPrims[i] = prims_i[id]; + } + void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr); + return std::make_pair(node,bounds); + }, + + /* lambda that calculates the bounds for some primitive */ + [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa { + return prims[morton.index].bounds(); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + morton_src.data(),morton_tmp.data(),primitiveCount, + *arguments); + + bvh->allocator.cleanup(); + return root.first; + } + + void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* calculate priminfo */ + auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t j=r.begin(); j<r.end(); j++) + bounds.extend((BBox3fa&)prims[j]); + return bounds; + }; + const CentGeomBBox3fa bounds = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* build BVH */ + void* root = BVHBuilderBinnedSAH::build<void*>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds; + setNodeBounds(node,cbounds, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that updates BVH nodes */ + [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims,pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + + static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) { + CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first); + unsigned int maxGeomID = max(a.second,b.second); + return std::pair<CentGeomBBox3fa,unsigned int>(centBounds,maxGeomID); + } + + void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic<size_t> progress(0); + + /* calculate priminfo */ + + auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int> + { + CentGeomBBox3fa bounds(empty); + unsigned maxGeomID = 0; + for (size_t j=r.begin(); j<r.end(); j++) + { + bounds.extend((BBox3fa&)prims[j]); + maxGeomID = max(maxGeomID,prims[j].geomID); + } + return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID); + }; + + + const std::pair<CentGeomBBox3fa,unsigned int> pair = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair); + + CentGeomBBox3fa bounds = pair.first; + const unsigned int maxGeomID = pair.second; + + if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)))) + { + /* fallback code for max geomID larger than threshold */ + return rtcBuildBVHBinnedSAH(arguments); + } + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* function that splits a build primitive */ + struct Splitter + { + Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr) + : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {} + + __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const + { + prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK; + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + left_o.geomIDref() = geomID; left_o.primIDref() = primID; + right_o.geomIDref() = geomID; right_o.primIDref() = primID; + } + + __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const + { + PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID); + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + } + + RTCSplitPrimitiveFunction splitPrimitive; + unsigned geomID; + unsigned primID; + void* userPtr; + }; + + /* build BVH */ + void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds; + setNodeBounds(node,cbounds, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that updates BVH nodes */ + [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* returns the splitter */ + [&] ( const PrimRef& prim ) -> Splitter { + return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims, + arguments->primitiveArrayCapacity, + pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + } +} + +using namespace embree; +using namespace embree::isa; + +RTC_NAMESPACE_BEGIN + + RTC_API RTCBVH rtcNewBVH(RTCDevice device) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewAllocator); + RTC_VERIFY_HANDLE(device); + BVH* bvh = new BVH((Device*)device); + return (RTCBVH) bvh->refInc(); + RTC_CATCH_END((Device*)device); + return nullptr; + } + + RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcBuildBVH); + RTC_VERIFY_HANDLE(bvh); + RTC_VERIFY_HANDLE(arguments); + RTC_VERIFY_HANDLE(arguments->createNode); + RTC_VERIFY_HANDLE(arguments->setNodeChildren); + RTC_VERIFY_HANDLE(arguments->setNodeBounds); + RTC_VERIFY_HANDLE(arguments->createLeaf); + + if (arguments->primitiveArrayCapacity < arguments->primitiveCount) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount") + + /* initialize the allocator */ + bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); + bvh->allocator.reset(); + + /* switch between differnet builders based on quality level */ + if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW) + return rtcBuildBVHMorton(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) + return rtcBuildBVHBinnedSAH(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) { + if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount) + return rtcBuildBVHBinnedSAH(arguments); + else + return rtcBuildBVHSpatialSAH(arguments); + } + else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality"); + + /* if we are in dynamic mode, then do not clear temporary data */ + if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) + { + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + } + + RTC_CATCH_END(bvh->device); + return nullptr; + } + + RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) + { + FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcThreadLocalAlloc); + return alloc->malloc0(bytes,align); + RTC_CATCH_END(alloc->alloc->getDevice()); + return nullptr; + } + + RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcStaticBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + RTC_CATCH_END(bvh->device); + } + + RTC_API void rtcRetainBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refInc(); + RTC_CATCH_END(device); + } + + RTC_API void rtcReleaseBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refDec(); + RTC_CATCH_END(device); + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp new file mode 100644 index 0000000000..408d7eae6f --- /dev/null +++ b/thirdparty/embree/kernels/common/scene.cpp @@ -0,0 +1,955 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + /* error raising rtcIntersect and rtcOccluded functions */ + void missing_rtcCommit() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); } + void invalid_rtcIntersect1() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); } + void invalid_rtcIntersect4() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); } + void invalid_rtcIntersect8() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); } + void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); } + void invalid_rtcIntersectN() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); } + + Scene::Scene (Device* device) + : device(device), + flags_modified(true), enabled_geometry_types(0), + scene_flags(RTC_SCENE_FLAG_NONE), + quality_flags(RTC_BUILD_QUALITY_MEDIUM), + is_build(false), modified(true), + progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0) + { + device->refInc(); + + intersectors = Accel::Intersectors(missing_rtcCommit); + + /* one can overwrite flags through device for debugging */ + if (device->quality_flags != -1) + quality_flags = (RTCBuildQuality) device->quality_flags; + if (device->scene_flags != -1) + scene_flags = (RTCSceneFlags) device->scene_flags; + } + + Scene::~Scene() noexcept + { + device->refDec(); + } + + void Scene::printStatistics() + { + /* calculate maximum number of time segments */ + unsigned max_time_steps = 0; + for (size_t i=0; i<size(); i++) { + if (!get(i)) continue; + max_time_steps = max(max_time_steps,get(i)->numTimeSteps); + } + + /* initialize vectors*/ + std::vector<size_t> statistics[Geometry::GTY_END]; + for (size_t i=0; i<Geometry::GTY_END; i++) + statistics[i].resize(max_time_steps); + + /* gather statistics */ + for (size_t i=0; i<size(); i++) + { + if (!get(i)) continue; + int ty = get(i)->getType(); + assert(ty<Geometry::GTY_END); + int timesegments = get(i)->numTimeSegments(); + assert((unsigned int)timesegments < max_time_steps); + statistics[ty][timesegments] += get(i)->size(); + } + + /* print statistics */ + std::cout << std::setw(23) << "segments" << ": "; + for (size_t t=0; t<max_time_steps; t++) + std::cout << std::setw(10) << t; + std::cout << std::endl; + + std::cout << "-------------------------"; + for (size_t t=0; t<max_time_steps; t++) + std::cout << "----------"; + std::cout << std::endl; + + for (size_t p=0; p<Geometry::GTY_END; p++) + { + if (std::string(Geometry::gtype_names[p]) == "") continue; + std::cout << std::setw(23) << Geometry::gtype_names[p] << ": "; + for (size_t t=0; t<max_time_steps; t++) + std::cout << std::setw(10) << statistics[p][t]; + std::cout << std::endl; + } + } + + void Scene::createTriangleAccel() + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + + break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->tri_accel == "bvh4.triangle4") accels_add(device->bvh4_factory->BVH4Triangle4 (this)); + else if (device->tri_accel == "bvh4.triangle4v") accels_add(device->bvh4_factory->BVH4Triangle4v(this)); + else if (device->tri_accel == "bvh4.triangle4i") accels_add(device->bvh4_factory->BVH4Triangle4i(this)); + else if (device->tri_accel == "qbvh4.triangle4i") accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel == "bvh8.triangle4") accels_add(device->bvh8_factory->BVH8Triangle4 (this)); + else if (device->tri_accel == "bvh8.triangle4v") accels_add(device->bvh8_factory->BVH8Triangle4v(this)); + else if (device->tri_accel == "bvh8.triangle4i") accels_add(device->bvh8_factory->BVH8Triangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4i") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel); +#endif + } + + void Scene::createTriangleMBAccel() + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb); +#endif + } + + void Scene::createQuadAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + /* static */ + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->quad_accel == "bvh4.quad4v") accels_add(device->bvh4_factory->BVH4Quad4v(this)); + else if (device->quad_accel == "bvh4.quad4i") accels_add(device->bvh4_factory->BVH4Quad4i(this)); + else if (device->quad_accel == "qbvh4.quad4i") accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel == "bvh8.quad4v") accels_add(device->bvh8_factory->BVH8Quad4v(this)); + else if (device->quad_accel == "bvh8.quad4i") accels_add(device->bvh8_factory->BVH8Quad4i(this)); + else if (device->quad_accel == "qbvh8.quad4i") accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel); +#endif + } + + void Scene::createQuadMBAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb); +#endif + } + + void Scene::createHairAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel); +#endif + } + + void Scene::createHairMBAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower + { + if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + } + } + else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb); +#endif + } + + void Scene::createSubdivAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + } + else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel); +#endif + } + + void Scene::createSubdivMBAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel_mb == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this)); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb); +#endif + } + + void Scene::createUserGeometryAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel); +#endif + } + + void Scene::createUserGeometryMBAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel_mb == "default" ) { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); + else +#endif + accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); + } + else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb); +#endif + } + + void Scene::createInstanceAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, false)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, false)); + } + //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createInstanceExpensiveAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceExpensiveMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, true)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, true)); + } + //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createGridAccel() + { + BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + else +#endif + { + accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + } + else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel); +#endif + + } + + void Scene::createGridMBAccel() + { +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel_mb == "default") + { + accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC)); + } + else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel); +#endif + + } + + void Scene::clear() { + } + + unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry) + { + Lock<SpinLock> lock(geometriesMutex); + if (geomID == RTC_INVALID_GEOMETRY_ID) { + geomID = id_pool.allocate(); + if (geomID == RTC_INVALID_GEOMETRY_ID) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene"); + } + else + { + if (!id_pool.add(geomID)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided"); + } + if (geomID >= geometries.size()) { + geometries.resize(geomID+1); + vertices.resize(geomID+1); + geometryModCounters_.resize(geomID+1); + } + geometries[geomID] = geometry; + geometryModCounters_[geomID] = 0; + if (geometry->isEnabled()) { + setModified (); + } + return geomID; + } + + void Scene::detachGeometry(size_t geomID) + { + Lock<SpinLock> lock(geometriesMutex); + + if (geomID >= geometries.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID"); + + Ref<Geometry>& geometry = geometries[geomID]; + if (geometry == null) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); + + if (geometry->isEnabled()) { + setModified (); + } + accels_deleteGeometry(unsigned(geomID)); + id_pool.deallocate((unsigned)geomID); + geometries[geomID] = null; + vertices[geomID] = nullptr; + geometryModCounters_[geomID] = 0; + } + + void Scene::updateInterface() + { + is_build = true; + } + + void Scene::commit_task () + { + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* print scene statistics */ + if (device->verbosity(2)) + printStatistics(); + + progress_monitor_counter = 0; + + /* gather scene stats and call preCommit function of each geometry */ + this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), + [this](const range<size_t>& r)->GeometryCounts + { + GeometryCounts c; + for (auto i=r.begin(); i<r.end(); ++i) + { + if (geometries[i] && geometries[i]->isEnabled()) + { + geometries[i]->preCommit(); + geometries[i]->addElementsToCount (c); + c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions(); + } + } + return c; + }, + std::plus<GeometryCounts>() + ); + + /* select acceleration structures to build */ + unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask(); + if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) + { + accels_init(); + + /* we need to make all geometries modified, otherwise two level builder will + not rebuild currently not modified geometries */ + parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) { + geometryModCounters_[i] = 0; + }); + + if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel(); + if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel(); + if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel(); + if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel(); + if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel(); + if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel(); + if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel(); + if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel(); + + flags_modified = false; + enabled_geometry_types = new_enabled_geometry_types; + } + + /* select fast code path if no filter function is present */ + accels_select(hasFilterFunction()); + + /* build all hierarchies of this scene */ + accels_build(); + + /* make static geometry immutable */ + if (!isDynamicAccel()) { + accels_immutable(); + flags_modified = true; // in non-dynamic mode we have to re-create accels + } + + /* call postCommit function of each geometry */ + parallel_for(geometries.size(), [&] ( const size_t i ) { + if (geometries[i] && geometries[i]->isEnabled()) { + geometries[i]->postCommit(); + vertices[i] = geometries[i]->getCompactVertexArray(); + geometryModCounters_[i] = geometries[i]->getModCounter(); + } + }); + + updateInterface(); + + if (device->verbosity(2)) { + std::cout << "created scene intersector" << std::endl; + accels_print(2); + std::cout << "selected scene intersector" << std::endl; + intersectors.print(2); + } + + setModified(false); + } + + void Scene::setBuildQuality(RTCBuildQuality quality_flags_i) + { + if (quality_flags == quality_flags_i) return; + quality_flags = quality_flags_i; + flags_modified = true; + } + + RTCBuildQuality Scene::getBuildQuality() const { + return quality_flags; + } + + void Scene::setSceneFlags(RTCSceneFlags scene_flags_i) + { + if (scene_flags == scene_flags_i) return; + scene_flags = scene_flags_i; + flags_modified = true; + } + + RTCSceneFlags Scene::getSceneFlags() const { + return scene_flags; + } + +#if defined(TASKING_INTERNAL) + + void Scene::commit (bool join) + { + Lock<MutexSys> buildLock(buildMutex,false); + + /* allocates own taskscheduler for each build */ + Ref<TaskScheduler> scheduler = nullptr; + { + Lock<MutexSys> lock(schedulerMutex); + scheduler = this->scheduler; + if (scheduler == null) { + buildLock.lock(); + this->scheduler = scheduler = new TaskScheduler; + } + } + + /* worker threads join build */ + if (!buildLock.isLocked()) + { + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); + + scheduler->join(); + return; + } + + /* initiate build */ + // -- GODOT start -- + // try { + scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); + // } + // catch (...) { + // accels_clear(); + // updateInterface(); + // Lock<MutexSys> lock(schedulerMutex); + // this->scheduler = nullptr; + // throw; + // } + // -- GODOT end -- + } + +#endif + +#if defined(TASKING_TBB) + + void Scene::commit (bool join) + { +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version"); +#endif + + /* try to obtain build lock */ + Lock<MutexSys> lock(buildMutex,buildMutex.try_lock()); + + /* join hierarchy build */ + if (!lock.isLocked()) + { +#if !TASKING_TBB_USE_TASK_ISOLATION + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); +#endif + + do { + +#if USE_TASK_ARENA + if (join) { + device->arena->execute([&]{ group.wait(); }); + } + else +#endif + { + group.wait(); + } + + pause_cpu(); + yield(); + } while (!buildMutex.try_lock()); + + buildMutex.unlock(); + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { +#if TBB_INTERFACE_VERSION_MAJOR < 8 + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); +#else + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); +#endif + //ctx.set_priority(tbb::priority_high); + +#if USE_TASK_ARENA + if (join) + { + device->arena->execute([&]{ + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + }); + } + else +#endif + { + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + } + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + } + catch (...) + { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + +#if defined(TASKING_PPL) + + void Scene::commit (bool join) + { +#if defined(TASKING_PPL) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL"); +#endif + + /* try to obtain build lock */ + Lock<MutexSys> lock(buildMutex); + + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { + + group.run([&]{ + concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); }); + }); + group.wait(); + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + } + catch (...) + { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + + void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) + { + progress_monitor_function = func; + progress_monitor_ptr = ptr; + } + + void Scene::progressMonitor(double dn) + { + if (progress_monitor_function) { + size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn)); + if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) { + throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination"); + } + } + } +} diff --git a/thirdparty/embree/kernels/common/scene.h b/thirdparty/embree/kernels/common/scene.h new file mode 100644 index 0000000000..5ed80a63f6 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene.h @@ -0,0 +1,390 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "builder.h" +#include "../../common/algorithms/parallel_any_of.h" +#include "scene_triangle_mesh.h" +#include "scene_quad_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_line_segments.h" +#include "scene_subdiv_mesh.h" +#include "scene_grid_mesh.h" +#include "scene_points.h" +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +namespace embree +{ + /*! Base class all scenes are derived from */ + class Scene : public AccelN + { + ALIGNED_CLASS_(std::alignment_of<Scene>::value); + + public: + template<typename Ty, bool mblur = false> + class Iterator + { + public: + Iterator () {} + + Iterator (Scene* scene, bool all = false) + : scene(scene), all(all) {} + + __forceinline Ty* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!all && !geom->isEnabled()) return nullptr; + const size_t mask = geom->getTypeMask() & Ty::geom_type; + if (!(mask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return nullptr; + return (Ty*) geom; + } + + __forceinline Ty* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + __forceinline size_t numPrimitives() const { + return scene->getNumPrimitives(Ty::geom_type,mblur); + } + + __forceinline size_t maxPrimitivesPerGeometry() + { + size_t ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->size()); + } + return ret; + } + + __forceinline unsigned int maxGeomID() + { + unsigned int ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,(unsigned int)i); + } + return ret; + } + + __forceinline unsigned maxTimeStepsPerGeometry() + { + unsigned ret = 0; + for (size_t i=0; i<scene->size(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->numTimeSteps); + } + return ret; + } + + private: + Scene* scene; + bool all; + }; + + class Iterator2 + { + public: + Iterator2 () {} + + Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) + : scene(scene), typemask(typemask), mblur(mblur) {} + + __forceinline Geometry* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!geom->isEnabled()) return nullptr; + if (!(geom->getTypeMask() & typemask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return nullptr; + return geom; + } + + __forceinline Geometry* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + private: + Scene* scene; + Geometry::GTypeMask typemask; + bool mblur; + }; + + public: + + /*! Scene construction */ + Scene (Device* device); + + /*! Scene destruction */ + ~Scene () noexcept; + + private: + /*! class is non-copyable */ + Scene (const Scene& other) DELETED; // do not implement + Scene& operator= (const Scene& other) DELETED; // do not implement + + public: + void createTriangleAccel(); + void createTriangleMBAccel(); + void createQuadAccel(); + void createQuadMBAccel(); + void createHairAccel(); + void createHairMBAccel(); + void createSubdivAccel(); + void createSubdivMBAccel(); + void createUserGeometryAccel(); + void createUserGeometryMBAccel(); + void createInstanceAccel(); + void createInstanceMBAccel(); + void createInstanceExpensiveAccel(); + void createInstanceExpensiveMBAccel(); + void createGridAccel(); + void createGridMBAccel(); + + /*! prints statistics about the scene */ + void printStatistics(); + + /*! clears the scene */ + void clear(); + + /*! detaches some geometry */ + void detachGeometry(size_t geomID); + + void setBuildQuality(RTCBuildQuality quality_flags); + RTCBuildQuality getBuildQuality() const; + + void setSceneFlags(RTCSceneFlags scene_flags); + RTCSceneFlags getSceneFlags() const; + + void commit (bool join); + void commit_task (); + void build () {} + + void updateInterface(); + + /* return number of geometries */ + __forceinline size_t size() const { return geometries.size(); } + + /* bind geometry to the scene */ + unsigned int bind (unsigned geomID, Ref<Geometry> geometry); + + /* determines if scene is modified */ + __forceinline bool isModified() const { return modified; } + + /* sets modified flag */ + __forceinline void setModified(bool f = true) { + modified = f; + } + + __forceinline bool isGeometryModified(size_t geomID) + { + Ref<Geometry>& g = geometries[geomID]; + if (!g) return false; + return g->getModCounter() > geometryModCounters_[geomID]; + } + + protected: + + __forceinline void checkIfModifiedAndSet () + { + if (isModified ()) return; + + auto geometryIsModified = [this](size_t geomID)->bool { + return isGeometryModified(geomID); + }; + + if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) { + setModified (); + } + } + + public: + + /* get mesh by ID */ + __forceinline Geometry* get(size_t i) { assert(i < geometries.size()); return geometries[i].ptr; } + __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; } + + template<typename Mesh> + __forceinline Mesh* get(size_t i) { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + template<typename Mesh> + __forceinline const Mesh* get(size_t i) const { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + + template<typename Mesh> + __forceinline Mesh* getSafe(size_t i) { + assert(i < geometries.size()); + if (geometries[i] == null) return nullptr; + if (!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr; + else return (Mesh*) geometries[i].ptr; + } + + __forceinline Ref<Geometry> get_locked(size_t i) { + Lock<SpinLock> lock(geometriesMutex); + assert(i < geometries.size()); + return geometries[i]; + } + + /* flag decoding */ + __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); } + __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; } + __forceinline bool isRobustAccel() const { return scene_flags & RTC_SCENE_FLAG_ROBUST; } + __forceinline bool isStaticAccel() const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); } + __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; } + + __forceinline bool hasContextFilterFunction() const { + return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION; + } + + __forceinline bool hasGeometryFilterFunction() { + return world.numFilterFunctions != 0; + } + + __forceinline bool hasFilterFunction() { + return hasContextFilterFunction() || hasGeometryFilterFunction(); + } + + /* test if scene got already build */ + __forceinline bool isBuild() const { return is_build; } + + public: + IDPool<unsigned,0xFFFFFFFE> id_pool; + vector<Ref<Geometry>> geometries; //!< list of all user geometries + vector<unsigned int> geometryModCounters_; + vector<float*> vertices; + + public: + Device* device; + + /* these are to detect if we need to recreate the acceleration structures */ + bool flags_modified; + unsigned int enabled_geometry_types; + + RTCSceneFlags scene_flags; + RTCBuildQuality quality_flags; + MutexSys buildMutex; + SpinLock geometriesMutex; + bool is_build; + private: + bool modified; //!< true if scene got modified + + public: + + /*! global lock step task scheduler */ +#if defined(TASKING_INTERNAL) + MutexSys schedulerMutex; + Ref<TaskScheduler> scheduler; +#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + tbb::isolated_task_group group; +#elif defined(TASKING_TBB) + tbb::task_group group; +#elif defined(TASKING_PPL) + concurrency::task_group group; +#endif + + public: + struct BuildProgressMonitorInterface : public BuildProgressMonitor { + BuildProgressMonitorInterface(Scene* scene) + : scene(scene) {} + void operator() (size_t dn) const { scene->progressMonitor(double(dn)); } + private: + Scene* scene; + }; + BuildProgressMonitorInterface progressInterface; + RTCProgressMonitorFunction progress_monitor_function; + void* progress_monitor_ptr; + std::atomic<size_t> progress_monitor_counter; + void progressMonitor(double nprims); + void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr); + + private: + GeometryCounts world; //!< counts for geometry + + public: + + __forceinline size_t numPrimitives() const { + return world.size(); + } + + __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const + { + size_t count = 0; + + if (mask & Geometry::MTY_TRIANGLE_MESH) + count += mblur ? world.numMBTriangles : world.numTriangles; + + if (mask & Geometry::MTY_QUAD_MESH) + count += mblur ? world.numMBQuads : world.numQuads; + + if (mask & Geometry::MTY_CURVE2) + count += mblur ? world.numMBLineSegments : world.numLineSegments; + + if (mask & Geometry::MTY_CURVE4) + count += mblur ? world.numMBBezierCurves : world.numBezierCurves; + + if (mask & Geometry::MTY_POINTS) + count += mblur ? world.numMBPoints : world.numPoints; + + if (mask & Geometry::MTY_SUBDIV_MESH) + count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches; + + if (mask & Geometry::MTY_USER_GEOMETRY) + count += mblur ? world.numMBUserGeometries : world.numUserGeometries; + + if (mask & Geometry::MTY_INSTANCE_CHEAP) + count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap; + + if (mask & Geometry::MTY_INSTANCE_EXPENSIVE) + count += mblur ? world.numMBInstancesExpensive : world.numInstancesExpensive; + + if (mask & Geometry::MTY_GRID_MESH) + count += mblur ? world.numMBGrids : world.numGrids; + + return count; + } + + template<typename Mesh, bool mblur> + __forceinline unsigned getNumTimeSteps() + { + if (!mblur) + return 1; + + Scene::Iterator<Mesh,mblur> iter(this); + return iter.maxTimeStepsPerGeometry(); + } + + template<typename Mesh, bool mblur> + __forceinline unsigned int getMaxGeomID() + { + Scene::Iterator<Mesh,mblur> iter(this); + return iter.maxGeomID(); + } + }; +} diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h new file mode 100644 index 0000000000..a5a39e42d4 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_curves.h @@ -0,0 +1,688 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +#include "../subdiv/bezier_curve.h" +#include "../subdiv/hermite_curve.h" +#include "../subdiv/bspline_curve.h" +#include "../subdiv/catmullrom_curve.h" +#include "../subdiv/linear_bezier_patch.h" + +namespace embree +{ + /*! represents an array of bicubic bezier curves */ + struct CurveGeometry : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4; + + public: + + /*! bezier curve construction */ + CurveGeometry (Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th curve */ + __forceinline const unsigned int& curve(size_t i) const { + return curves[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th tangent of the first time step */ + __forceinline Vec3ff tangent(size_t i) const { + return tangents0[i]; + } + + /*! returns i'th normal derivative of the first time step */ + __forceinline Vec3fa dnormal(size_t i) const { + return dnormals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th tangent of itime'th timestep */ + __forceinline Vec3ff tangent(size_t i, size_t itime) const { + return tangents[itime][i]; + } + + /*! returns i'th normal derivative of itime'th timestep */ + __forceinline Vec3fa dnormal(size_t i, size_t itime) const { + return dnormals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + n0 = normal(i+0); + n1 = normal(i+1); + n2 = normal(i+2); + n3 = normal(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + n2 = normal(i+2,itime); + n3 = normal(i+3,itime); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL1_vertices(size_t i) const + { + prefetchL1(vertices0.getPtr(i)+0); + prefetchL1(vertices0.getPtr(i)+64); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL2_vertices(size_t i) const + { + prefetchL2(vertices0.getPtr(i)+0); + prefetchL2(vertices0.getPtr(i)+64); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; + gather(a0,a1,a2,a3,i,itime); + Vec3ff b0,b1,b2,b3; + gather(b0,b1,b2,b3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3; + gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime); + Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3; + gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + n0 = madd(Vec3ff(t0),an0,t1*bn0); + n1 = madd(Vec3ff(t0),an1,t1*bn1); + n2 = madd(Vec3ff(t0),an2,t1*bn2); + n3 = madd(Vec3ff(t0),an3,t1*bn3); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3; + unsigned int vertexID = curve(primID); + gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime); + SourceCurve3ff ccurve(v0,v1,v2,v3); + SourceCurve3fa ncurve(n0,n1,n2,n3); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; + gather_hermite(ap0,at0,ap1,at1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; + gather_hermite(bp0,bt0,bp1,bt1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + n0 = normal(i+0); + n1 = normal(i+1); + dn0 = dnormal(i+0); + dn1 = dnormal(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + dn0 = dnormal(i+0,itime); + dn1 = dnormal(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1; + gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1; + gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + n0 = madd(Vec3ff(f0),an0,f1*bn0); + dn0= madd(Vec3ff(f0),adn0,f1*bdn0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + n1 = madd(Vec3ff(f0),an1,f1*bn1); + dn1= madd(Vec3ff(f0),adn1,f1*bdn1); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1; + unsigned int vertexID = curve(primID); + gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime); + + SourceCurve3ff ccurve(v0,t0,v1,t1); + SourceCurve3fa ncurve(n0,dn0,n1,dn1); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa> + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + private: + void resizeBuffers(unsigned int numSteps); + + public: + BufferView<unsigned int> curves; //!< array of curve indices + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + BufferView<Vec3ff> tangents0; //!< fast access to first tangent buffer + BufferView<Vec3fa> dnormals0; //!< fast access to first normal derivative buffer + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<Vec3ff>> tangents; //!< tangent array for each timestep + vector<BufferView<Vec3fa>> dnormals; //!< normal derivative array for each timestep + BufferView<char> flags; //!< start, end flag per segment + vector<BufferView<char>> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for flat curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + + template<template<typename Ty> class Curve> + struct CurveGeometryInterface : public CurveGeometry + { + typedef Curve<Vec3ff> Curve3ff; + typedef Curve<Vec3fa> Curve3fa; + + CurveGeometryInterface (Device* device, Geometry::GType gtype) + : CurveGeometry(device,gtype) {} + + __forceinline const Curve3ff getCurveScaledRadius(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + Vec3ff v0 = vertex(index+0,itime); + Vec3ff v1 = vertex(index+1,itime); + Vec3ff v2 = vertex(index+2,itime); + Vec3ff v3 = vertex(index+3,itime); + v0.w *= maxRadiusScale; + v1.w *= maxRadiusScale; + v2.w *= maxRadiusScale; + v3.w *= maxRadiusScale; + return Curve3ff (v0,v1,v2,v3); + } + + __forceinline const Curve3ff getCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff v2 = vertex(index+2,itime); + const Vec3ff v3 = vertex(index+3,itime); + const Vec3ff w0(xfmPoint(space,(Vec3fa)v0), maxRadiusScale*v0.w); + const Vec3ff w1(xfmPoint(space,(Vec3fa)v1), maxRadiusScale*v1.w); + const Vec3ff w2(xfmPoint(space,(Vec3fa)v2), maxRadiusScale*v2.w); + const Vec3ff w3(xfmPoint(space,(Vec3fa)v3), maxRadiusScale*v3.w); + return Curve3ff(w0,w1,w2,w3); + } + + __forceinline const Curve3ff getCurveScaledRadius(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const float r_scale = r_scale0*scale; + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff v2 = vertex(index+2,itime); + const Vec3ff v3 = vertex(index+3,itime); + const Vec3ff w0(xfmPoint(space,((Vec3fa)v0-ofs)*Vec3fa(scale)), maxRadiusScale*v0.w*r_scale); + const Vec3ff w1(xfmPoint(space,((Vec3fa)v1-ofs)*Vec3fa(scale)), maxRadiusScale*v1.w*r_scale); + const Vec3ff w2(xfmPoint(space,((Vec3fa)v2-ofs)*Vec3fa(scale)), maxRadiusScale*v2.w*r_scale); + const Vec3ff w3(xfmPoint(space,((Vec3fa)v3-ofs)*Vec3fa(scale)), maxRadiusScale*v3.w*r_scale); + return Curve3ff(w0,w1,w2,w3); + } + + __forceinline const Curve3fa getNormalCurve(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + const Vec3fa n2 = normal(index+2,itime); + const Vec3fa n3 = normal(index+3,itime); + return Curve3fa (n0,n1,n2,n3); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(size_t i, size_t itime = 0) const + { + const Curve3ff center = getCurveScaledRadius(i,itime); + const Curve3fa normal = getNormalCurve(i,itime); + const TensorLinearCubicBezierSurface3fa ocurve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,normal); + return ocurve; + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const Vec3fa& ofs, const float scale, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space,ofs,scale); + } + + /*! check if the i'th primitive is valid at the itime'th time step */ + __forceinline bool valid(Geometry::GType ctype, size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = curve(i); + if (index+3 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const float r0 = radius(index+0,itime); + const float r1 = radius(index+1,itime); + const float r2 = radius(index+2,itime); + const float r3 = radius(index+3,itime); + if (!isvalid(r0) || !isvalid(r1) || !isvalid(r2) || !isvalid(r3)) + return false; + + const Vec3fa v0 = vertex(index+0,itime); + const Vec3fa v1 = vertex(index+1,itime); + const Vec3fa v2 = vertex(index+2,itime); + const Vec3fa v3 = vertex(index+3,itime); + if (!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3)) + return false; + + if (ctype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) + { + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + if (!isvalid(n0) || !isvalid(n1)) + return false; + } + } + + return true; + } + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* ddPdudu = args->ddPdudu; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + size_t ofs = i*sizeof(float); + const size_t index = curves[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+0)*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+1)*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+2)*stride+ofs]); + const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(index+3)*stride+ofs]); + + const Curve<vfloat<N>> curve(p0,p1,p2,p3); + if (P ) mem<vfloat<N>>::storeu(valid,P+i, curve.eval(u)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i, curve.eval_du(u)); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,curve.eval_dudu(u)); + } + } + + void interpolate(const RTCInterpolateArguments* const args) { + interpolate_impl<4>(args); + } + }; + + template<template<typename Ty> class Curve> + struct HermiteCurveGeometryInterface : public CurveGeometry + { + typedef Curve<Vec3ff> HermiteCurve3ff; + typedef Curve<Vec3fa> HermiteCurve3fa; + + HermiteCurveGeometryInterface (Device* device, Geometry::GType gtype) + : CurveGeometry(device,gtype) {} + + __forceinline const HermiteCurve3ff getCurveScaledRadius(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + Vec3ff v0 = vertex(index+0,itime); + Vec3ff v1 = vertex(index+1,itime); + Vec3ff t0 = tangent(index+0,itime); + Vec3ff t1 = tangent(index+1,itime); + v0.w *= maxRadiusScale; + v1.w *= maxRadiusScale; + t0.w *= maxRadiusScale; + t1.w *= maxRadiusScale; + return HermiteCurve3ff (v0,t0,v1,t1); + } + + __forceinline const HermiteCurve3ff getCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff t0 = tangent(index+0,itime); + const Vec3ff t1 = tangent(index+1,itime); + const Vec3ff V0(xfmPoint(space,(Vec3fa)v0),maxRadiusScale*v0.w); + const Vec3ff V1(xfmPoint(space,(Vec3fa)v1),maxRadiusScale*v1.w); + const Vec3ff T0(xfmVector(space,(Vec3fa)t0),maxRadiusScale*t0.w); + const Vec3ff T1(xfmVector(space,(Vec3fa)t1),maxRadiusScale*t1.w); + return HermiteCurve3ff(V0,T0,V1,T1); + } + + __forceinline const HermiteCurve3ff getCurveScaledRadius(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const + { + const float r_scale = r_scale0*scale; + const unsigned int index = curve(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff t0 = tangent(index+0,itime); + const Vec3ff t1 = tangent(index+1,itime); + const Vec3ff V0(xfmPoint(space,(v0-ofs)*Vec3fa(scale)), maxRadiusScale*v0.w*r_scale); + const Vec3ff V1(xfmPoint(space,(v1-ofs)*Vec3fa(scale)), maxRadiusScale*v1.w*r_scale); + const Vec3ff T0(xfmVector(space,t0*Vec3fa(scale)), maxRadiusScale*t0.w*r_scale); + const Vec3ff T1(xfmVector(space,t1*Vec3fa(scale)), maxRadiusScale*t1.w*r_scale); + return HermiteCurve3ff(V0,T0,V1,T1); + } + + __forceinline const HermiteCurve3fa getNormalCurve(size_t i, size_t itime = 0) const + { + const unsigned int index = curve(i); + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + const Vec3fa dn0 = dnormal(index+0,itime); + const Vec3fa dn1 = dnormal(index+1,itime); + return HermiteCurve3fa (n0,dn0,n1,dn1); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(size_t i, size_t itime = 0) const + { + const HermiteCurve3ff center = getCurveScaledRadius(i,itime); + const HermiteCurve3fa normal = getNormalCurve(i,itime); + const TensorLinearCubicBezierSurface3fa ocurve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,normal); + return ocurve; + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space); + } + + __forceinline const TensorLinearCubicBezierSurface3fa getOrientedCurveScaledRadius(const Vec3fa& ofs, const float scale, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + return getOrientedCurveScaledRadius(i,itime).xfm(space,ofs,scale); + } + + /*! check if the i'th primitive is valid at the itime'th time step */ + __forceinline bool valid(Geometry::GType ctype, size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = curve(i); + if (index+1 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + if (!isvalid4(v0) || !isvalid4(v1)) + return false; + + const Vec3ff t0 = tangent(index+0,itime); + const Vec3ff t1 = tangent(index+1,itime); + if (!isvalid4(t0) || !isvalid4(t1)) + return false; + + if (ctype == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) + { + const Vec3fa n0 = normal(index+0,itime); + const Vec3fa n1 = normal(index+1,itime); + if (!isvalid(n0) || !isvalid(n1)) + return false; + + const Vec3fa dn0 = dnormal(index+0,itime); + const Vec3fa dn1 = dnormal(index+1,itime); + if (!isvalid(dn0) || !isvalid(dn1)) + return false; + } + } + + return true; + } + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* ddPdudu = args->ddPdudu; + unsigned int valueCount = args->valueCount; + + /* we interpolate vertex attributes linearly for hermite basis */ + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + assert(bufferSlot <= vertexAttribs.size()); + const char* vsrc = vertexAttribs[bufferSlot].getPtr(); + const size_t vstride = vertexAttribs[bufferSlot].getStride(); + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const size_t index = curves[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+0)*vstride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+1)*vstride+ofs]); + + if (P ) mem<vfloat<N>>::storeu(valid,P+i, madd(1.0f-u,p0,u*p1)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i, p1-p0); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + } + } + + /* interpolation for vertex buffers */ + else + { + assert(bufferSlot < numTimeSteps); + const char* vsrc = vertices[bufferSlot].getPtr(); + const char* tsrc = tangents[bufferSlot].getPtr(); + const size_t vstride = vertices[bufferSlot].getStride(); + const size_t tstride = vertices[bufferSlot].getStride(); + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const size_t index = curves[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>((int)valueCount); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+0)*vstride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&vsrc[(index+1)*vstride+ofs]); + const vfloat<N> t0 = mem<vfloat<N>>::loadu(valid,(float*)&tsrc[(index+0)*tstride+ofs]); + const vfloat<N> t1 = mem<vfloat<N>>::loadu(valid,(float*)&tsrc[(index+1)*tstride+ofs]); + + const HermiteCurveT<vfloat<N>> curve(p0,t0,p1,t1); + if (P ) mem<vfloat<N>>::storeu(valid,P+i, curve.eval(u)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i, curve.eval_du(u)); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,ddPdudu+i,curve.eval_dudu(u)); + } + } + } + + void interpolate(const RTCInterpolateArguments* const args) { + interpolate_impl<4>(args); + } + }; + } + + DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree/kernels/common/scene_grid_mesh.h b/thirdparty/embree/kernels/common/scene_grid_mesh.h new file mode 100644 index 0000000000..fb6fed445b --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_grid_mesh.h @@ -0,0 +1,294 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Grid Mesh */ + struct GridMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH; + + /*! grid */ + struct Grid + { + unsigned int startVtxID; + unsigned int lineVtxOffset; + unsigned short resX,resY; + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsX(const unsigned int x) const + { + return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0; + } + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsY(const unsigned int y) const + { + return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0; + } + + /*! outputs grid structure */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) { + return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }"; + } + }; + + public: + + /*! grid mesh construction */ + GridMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float U = args->u; + float V = args->v; + + /* clamp input u,v to [0;1] range */ + U = max(min(U,1.0f),0.0f); + V = max(min(V,1.0f),0.0f); + + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + const Grid& grid = grids[primID]; + const int grid_width = grid.resX-1; + const int grid_height = grid.resY-1; + const float rcp_grid_width = rcp(float(grid_width)); + const float rcp_grid_height = rcp(float(grid_height)); + const int iu = min((int)floor(U*grid_width ),grid_width); + const int iv = min((int)floor(V*grid_height),grid_height); + const float u = U*grid_width-float(iu); + const float v = V*grid_height-float(iv); + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const unsigned int idx0 = grid.startVtxID + (iv+0)*grid.lineVtxOffset + iu; + const unsigned int idx1 = grid.startVtxID + (iv+1)*grid.lineVtxOffset + iu; + + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+0)*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx0+1)*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+1)*stride+ofs]); + const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[(idx1+0)*stride+ofs]); + const vbool<N> left = u+v <= 1.0f; + const vfloat<N> Q0 = select(left,p0,p2); + const vfloat<N> Q1 = select(left,p1,p3); + const vfloat<N> Q2 = select(left,p3,p1); + const vfloat<N> U = select(left,u,vfloat<N>(1.0f)-u); + const vfloat<N> V = select(left,v,vfloat<N>(1.0f)-v); + const vfloat<N> W = 1.0f-U-V; + + if (P) { + mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2))); + } + if (dPdu) { + assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)*rcp_grid_width); + assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)*rcp_grid_height); + } + if (ddPdudu) { + assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero)); + assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero)); + } + } + } + + void addElementsToCount (GeometryCounts & counts) const; + + __forceinline unsigned int getNumSubGrids(const size_t gridID) + { + const Grid &g = grid(gridID); + return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1)); + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th grid*/ + __forceinline const Grid& grid(size_t i) const { + return grids[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th vertex of the first timestep */ + __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const { + assert(x < (size_t)g.resX); + assert(y < (size_t)g.resY); + return g.startVtxID + x + y * g.lineVtxOffset; + } + + /*! returns i'th vertex of the first timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index); + } + + /*! returns i'th vertex of the itime'th timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index,itime); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const + { + BBox3fa b(empty); + for (size_t t=0; t<numTimeSteps; t++) + { + for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++) + for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++) + { + const Vec3fa v = grid_vertex(g,x,y,t); + if (unlikely(!isvalid(v))) return false; + b.extend(v); + } + } + + bbox = b; + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const + { + assert(itime < numTimeSteps); + BBox3fa b0(empty); + for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++) + for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++) + { + const Vec3fa v = grid_vertex(g,x,y,itime); + if (unlikely(!isvalid(v))) return false; + b0.extend(v); + } + + /* use bounds of first time step in builder */ + bbox = b0; + return true; + } + + __forceinline bool valid(size_t gridID, size_t itime=0) const { + return valid(gridID, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const + { + if (unlikely(gridID >= grids.size())) return false; + const Grid &g = grid(gridID); + if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false; + if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false; + + for (size_t y=0;y<g.resY;y++) + for (size_t x=0;x<g.resX;x++) + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(grid_vertex(g,x,y,itime))) return false; + return true; + } + + + __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const + { + BBox3fa box(empty); + buildBounds(g,sx,sy,itime,box); + return box; + } + + __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const { + BBox3fa bounds0, bounds1; + buildBounds(g,sx,sy,itime+0,bounds0); + buildBounds(g,sx,sy,itime+1,bounds1); + return LBBox3fa(bounds0,bounds1); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments); + } + + public: + BufferView<Grid> grids; //!< array of triangles + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<RawBufferView> vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct GridMeshISA : public GridMesh + { + GridMeshISA (Device* device) + : GridMesh(device) {} + }; + } + + DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_instance.h b/thirdparty/embree/kernels/common/scene_instance.h new file mode 100644 index 0000000000..773f2b6fec --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_instance.h @@ -0,0 +1,272 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "accel.h" + +namespace embree +{ + struct MotionDerivativeCoefficients; + + /*! Instanced acceleration structure */ + struct Instance : public Geometry + { + ALIGNED_STRUCT_(16); + static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE; + + public: + Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1); + ~Instance(); + + private: + Instance (const Instance& other) DELETED; // do not implement + Instance& operator= (const Instance& other) DELETED; // do not implement + + private: + LBBox3fa nonlinearBounds(const BBox1f& time_range_in, + const BBox1f& geom_time_range, + float geom_time_segments) const; + + BBox3fa boundSegment(size_t itime, + BBox3fa const& obbox0, BBox3fa const& obbox1, + BBox3fa const& bbox0, BBox3fa const& bbox1, + float t_min, float t_max) const; + + /* calculates the (correct) interpolated bounds */ + __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(slerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + return xfmBounds(lerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + } + + public: + virtual void setNumTimeSteps (unsigned int numTimeSteps) override; + virtual void setInstancedScene(const Ref<Scene>& scene) override; + virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override; + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override; + virtual AffineSpace3fa getTransform(float time) override; + virtual void setMask (unsigned mask) override; + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const override; + virtual void commit() override; + + public: + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds()); + return xfmBounds(local2world[0],object->bounds.bounds()); + } + + /*! gets the bounds of the instanced scene */ + __forceinline BBox3fa getObjectBounds(size_t itime) const { + return object->getBounds(timeStep(itime)); + } + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime)); + return xfmBounds(local2world[itime],getObjectBounds(itime)); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const { + assert(i == 0); + LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments); + return lbbox; + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + assert(i==0); + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + assert(i==0); + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds (); + return isvalid(bounds); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + assert(i == 0); + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(bounds(i,itime))) return false; + + return true; + } + + __forceinline AffineSpace3fa getLocal2World() const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return quaternionDecompositionToAffineSpace(local2world[0]); + return local2world[0]; + } + + __forceinline AffineSpace3fa getLocal2World(float t) const + { + float ftime; const unsigned int itime = timeSegment(t, ftime); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return slerp(local2world[itime+0],local2world[itime+1],ftime); + return lerp(local2world[itime+0],local2world[itime+1],ftime); + } + + __forceinline AffineSpace3fa getWorld2Local() const { + return world2local0; + } + + __forceinline AffineSpace3fa getWorld2Local(float t) const { + return rcp(getLocal2World(t)); + } + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return getWorld2LocalSlerp<K>(valid, t); + return getWorld2LocalLerp<K>(valid, t); + } + + private: + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const + { + vfloat<K> ftime; + const vint<K> itime_k = timeSegment<K>(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint<K>(itime)))) { + return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]), + AffineSpace3vff<K>(local2world[itime+1]), + ftime)); + } + else { + AffineSpace3vff<K> space0,space1; + vbool<K> valid1 = valid; + while (any(valid1)) { + vbool<K> valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0); + space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1); + } + return rcp(slerp(space0, space1, ftime)); + } + } + + template<int K> + __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const + { + vfloat<K> ftime; + const vint<K> itime_k = timeSegment<K>(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint<K>(itime)))) { + return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), + AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), + ftime)); + } else { + AffineSpace3vf<K> space0,space1; + vbool<K> valid1 = valid; + while (any(valid1)) { + vbool<K> valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0); + space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1); + } + return rcp(lerp(space0, space1, ftime)); + } + } + + public: + Accel* object; //!< pointer to instanced acceleration structure + AffineSpace3ff* local2world; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition) + AffineSpace3fa world2local0; //!< transformation from world space to local space for timestep 0 + }; + + namespace isa + { + struct InstanceISA : public Instance + { + InstanceISA (Device* device) + : Instance(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // const BBox3fa b = bounds(0); + // if (!isvalid(b)) return pinfo; + + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // if (!valid(0,range<size_t>(itime))) return pinfo; + // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0)); + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfoMB pinfo(empty); + if (!valid(0, timeSegmentRange(t0t1))) return pinfo; + const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0)); + pinfo.add_primref(prim); + prims[k++] = prim; + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_line_segments.h b/thirdparty/embree/kernels/common/scene_line_segments.h new file mode 100644 index 0000000000..3c9fdb39db --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_line_segments.h @@ -0,0 +1,345 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! represents an array of line segments */ + struct LineSegments : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2; + + public: + + /*! line segments construction */ + LineSegments (Device* device, Geometry::GType gtype); + + public: + void setMask (unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify (); + void interpolate(const RTCInterpolateArguments* const args); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* ddPdudu = args->ddPdudu; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + const size_t ofs = i*sizeof(float); + const size_t segment = segments[primID]; + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[(segment+0)*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[(segment+1)*stride+ofs]); + if (P ) mem<vfloat<N>>::storeu(valid,P+i,lerp(p0,p1,u)); + if (dPdu ) mem<vfloat<N>>::storeu(valid,dPdu+i,p1-p0); + if (ddPdudu) mem<vfloat<N>>::storeu(valid,dPdu+i,vfloat<N>(zero)); + } + } + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th segment */ + __forceinline const unsigned int& segment(size_t i) const { + return segments[i]; + } + + /*! returns the segment to the left of the i'th segment */ + __forceinline bool segmentLeftExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0; + } + + /*! returns the segment to the right of the i'th segment */ + __forceinline bool segmentRightExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const + { + const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1)); + return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w))); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = segment(i); + if (index+1 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false; + const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false; + if (min(v0.w,v1.w) < 0.0f) return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i,0)) return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i,itime+0) || !valid(i,itime+1)) return false; + bbox = bounds(i,itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView<unsigned int> segments; //!< array of line segment indices + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + BufferView<char> flags; //!< start, end flag per segment + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<char>> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for bezier curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct LineSegmentsISA : public LineSegments + { + LineSegmentsISA (Device* device, Geometry::GType gtype) + : LineSegments(device,gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0); + const Vec3fa v1 = vertex(vtxID+1); + return v1-v0; + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0,time); + const Vec3fa v1 = vertex(vtxID+1,time); + return v1-v0; + } + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const { + return bounds(space,i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + return linearBounds(primID,time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + return linearBounds(space,primID,time_range); + } + }; + } + + DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree/kernels/common/scene_points.h b/thirdparty/embree/kernels/common/scene_points.h new file mode 100644 index 0000000000..017e098a51 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_points.h @@ -0,0 +1,282 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "buffer.h" +#include "default.h" +#include "geometry.h" + +namespace embree +{ + /*! represents an array of points */ + struct Points : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS; + + public: + /*! line segments construction */ + Points(Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps(unsigned int numTimeSteps); + void setVertexAttributeCount(unsigned int N); + void setBuffer(RTCBufferType type, + unsigned int slot, + RTCFormat format, + const Ref<Buffer>& buffer, + size_t offset, + size_t stride, + unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0) const { + return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w)); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const Vec3ff v0 = vertex(i); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const Vec3ff v0 = vertex(i); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const unsigned int index = (unsigned int)i; + if (index >= numVertices()) + return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) { + const Vec3ff v0 = vertex(index + 0, itime); + if (unlikely(!isvalid4(v0))) + return false; + if (v0.w < 0.0f) + return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i, 0)) + return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i, itime + 0) || !valid(i, itime + 1)) + return false; + bbox = bounds(i, itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView<Vec3ff> vertices0; //!< fast access to first vertex buffer + BufferView<Vec3fa> normals0; //!< fast access to first normal buffer + vector<BufferView<Vec3ff>> vertices; //!< vertex array for each timestep + vector<BufferView<Vec3fa>> normals; //!< normal array for each timestep + vector<BufferView<char>> vertexAttribs; //!< user buffers + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct PointsISA : public Points + { + PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + return Vec3fa(1, 0, 0); + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + return Vec3fa(1, 0, 0); + } + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, &bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, itime, bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, + const BBox1f& t0t1, + const range<size_t>& r, + size_t k, + unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + if (!valid(j, timeSegmentRange(t0t1))) + continue; + const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const + { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const + { + return bounds(space, i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const + { + return linearBounds(primID, time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const + { + return linearBounds(space, primID, time_range); + } + }; + } // namespace isa + + DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType); +} // namespace embree diff --git a/thirdparty/embree/kernels/common/scene_quad_mesh.h b/thirdparty/embree/kernels/common/scene_quad_mesh.h new file mode 100644 index 0000000000..bd8eeaaeb7 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_quad_mesh.h @@ -0,0 +1,337 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Quad Mesh */ + struct QuadMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH; + + /*! triangle indices */ + struct Quad + { + uint32_t v[4]; + + /*! outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) { + return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }"; + } + }; + + public: + + /*! quad mesh construction */ + QuadMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + float v = args->v; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const size_t ofs = i*sizeof(float); + const Quad& tri = quad(primID); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]); + const vfloat<N> p3 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[3]*stride+ofs]); + const vbool<N> left = u+v <= 1.0f; + const vfloat<N> Q0 = select(left,p0,p2); + const vfloat<N> Q1 = select(left,p1,p3); + const vfloat<N> Q2 = select(left,p3,p1); + const vfloat<N> U = select(left,u,vfloat<N>(1.0f)-u); + const vfloat<N> V = select(left,v,vfloat<N>(1.0f)-v); + const vfloat<N> W = 1.0f-U-V; + if (P) { + mem<vfloat<N>>::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2))); + } + if (dPdu) { + assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)); + assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)); + } + if (ddPdudu) { + assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero)); + assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero)); + } + } + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th quad */ + __forceinline const Quad& quad(size_t i) const { + return quads[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th quad */ + __forceinline BBox3fa bounds(size_t i) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! calculates the bounds of the i'th quad at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0],itime); + const Vec3fa v1 = vertex(q.v[1],itime); + const Vec3fa v2 = vertex(q.v[2],itime); + const Vec3fa v3 = vertex(q.v[3],itime); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(q.v[0],itime))) return false; + if (!isvalid(vertex(q.v[1],itime))) return false; + if (!isvalid(vertex(q.v[2],itime))) return false; + if (!isvalid(vertex(q.v[3],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th quad at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Quad& q = quad(i); + if (q.v[0] >= numVertices()) return false; + if (q.v[1] >= numVertices()) return false; + if (q.v[2] >= numVertices()) return false; + if (q.v[3] >= numVertices()) return false; + + for (unsigned int t=0; t<numTimeSteps; t++) + { + const Vec3fa v0 = vertex(q.v[0],t); + const Vec3fa v1 = vertex(q.v[1],t); + const Vec3fa v2 = vertex(q.v[2],t); + const Vec3fa v3 = vertex(q.v[3],t); + + if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3))) + return false; + } + + if (bbox) + *bbox = bounds(i); + + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false; + const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return quads.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return quads.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return areaProjectedTriangle(v0,v1,v3) + + areaProjectedTriangle(v1,v2,v3); + } + + public: + BufferView<Quad> quads; //!< array of quads + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers + }; + + namespace isa + { + struct QuadMeshISA : public QuadMesh + { + QuadMeshISA (Device* device) + : QuadMesh(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h new file mode 100644 index 0000000000..1db170196d --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_subdiv_mesh.h @@ -0,0 +1,326 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" +#include "../subdiv/half_edge.h" +#include "../subdiv/tessellation_cache.h" +#include "../subdiv/catmullclark_coefficients.h" +#include "../subdiv/patch.h" +#include "../../common/algorithms/parallel_map.h" +#include "../../common/algorithms/parallel_set.h" + +namespace embree +{ + class SubdivMesh : public Geometry + { + ALIGNED_CLASS_(16); + public: + + typedef HalfEdge::Edge Edge; + + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH; + + /*! structure used to sort half edges using radix sort by their key */ + struct KeyHalfEdge + { + KeyHalfEdge() {} + + KeyHalfEdge (uint64_t key, HalfEdge* edge) + : key(key), edge(edge) {} + + __forceinline operator uint64_t() const { + return key; + } + + friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) { + return e0.key < e1.key; + } + + public: + uint64_t key; + HalfEdge* edge; + }; + + public: + + /*! subdiv mesh construction */ + SubdivMesh(Device* device); + + public: + void setMask (unsigned mask); + void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode); + void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setTopologyCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void setTessellationRate(float N); + bool verify(); + void commit(); + void addElementsToCount (GeometryCounts & counts) const; + void setDisplacementFunction (RTCDisplacementFunctionN func); + unsigned int getFirstHalfEdge(unsigned int faceID); + unsigned int getFace(unsigned int edgeID); + unsigned int getNextHalfEdge(unsigned int edgeID); + unsigned int getPreviousHalfEdge(unsigned int edgeID); + unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID); + + public: + + /*! return the number of faces */ + size_t numFaces() const { + return faceVertices.size(); + } + + /*! return the number of edges */ + size_t numEdges() const { + return topology[0].vertexIndices.size(); + } + + /*! return the number of vertices */ + size_t numVertices() const { + return vertices[0].size(); + } + + /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t j = 0) const { + return topology[0].getHalfEdge(i)->bounds(vertices[j]); + } + + /*! check if the i'th primitive is valid */ + __forceinline bool valid(size_t i) const { + return topology[0].valid(i) && !invalidFace(i); + } + + /*! check if the i'th primitive is valid for the j'th time range */ + __forceinline bool valid(size_t i, size_t j) const { + return topology[0].valid(i) && !invalidFace(i,j); + } + + /*! prints some statistics */ + void printStatistics(); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + public: + + /*! returns the vertex buffer for some time step */ + __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const { + return vertices[t]; + } + + /* returns tessellation level of edge */ + __forceinline float getEdgeLevel(const size_t i) const + { + if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level? + else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level? + } + + public: + RTCDisplacementFunctionN displFunc; //!< displacement function + + /*! all buffers in this section are provided by the application */ + public: + + /*! the topology contains all data that may differ when + * interpolating different user data buffers */ + struct Topology + { + public: + + /*! Default topology construction */ + Topology () : halfEdges(nullptr,0) {} + + /*! Topology initialization */ + Topology (SubdivMesh* mesh); + + /*! make the class movable */ + public: + Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + : mesh(std::move(other.mesh)), + vertexIndices(std::move(other.vertexIndices)), + subdiv_mode(std::move(other.subdiv_mode)), + halfEdges(std::move(other.halfEdges)), + halfEdges0(std::move(other.halfEdges0)), + halfEdges1(std::move(other.halfEdges1)) {} + + Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + { + mesh = std::move(other.mesh); + vertexIndices = std::move(other.vertexIndices); + subdiv_mode = std::move(other.subdiv_mode); + halfEdges = std::move(other.halfEdges); + halfEdges0 = std::move(other.halfEdges0); + halfEdges1 = std::move(other.halfEdges1); + return *this; + } + + public: + /*! check if the i'th primitive is valid in this topology */ + __forceinline bool valid(size_t i) const + { + if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) { + if (getHalfEdge(i)->faceHasBorder()) return false; + } + return true; + } + + /*! updates the interpolation mode for the topology */ + void setSubdivisionMode (RTCSubdivisionMode mode); + + /*! marks all buffers as modified */ + void update (); + + /*! verifies index array */ + bool verify (size_t numVertices); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + private: + + /*! recalculates the half edges */ + void calculateHalfEdges(); + + /*! updates half edges when recalculation is not necessary */ + void updateHalfEdges(); + + /*! user input data */ + public: + + SubdivMesh* mesh; + + /*! indices of the vertices composing each face */ + BufferView<unsigned int> vertexIndices; + + /*! subdiv interpolation mode */ + RTCSubdivisionMode subdiv_mode; + + /*! generated data */ + public: + + /*! returns the start half edge for face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { + return &halfEdges[mesh->faceStartEdge[f]]; + } + + /*! Half edge structure, generated by initHalfEdgeStructures */ + mvector<HalfEdge> halfEdges; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! two arrays used to sort the half edges */ + std::vector<KeyHalfEdge> halfEdges0; + std::vector<KeyHalfEdge> halfEdges1; + }; + + /*! returns the start half edge for topology t and face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { + return topology[t].getHalfEdge(f); + } + + /*! buffer containing the number of vertices for each face */ + BufferView<unsigned int> faceVertices; + + /*! array of topologies */ + vector<Topology> topology; + + /*! vertex buffer (one buffer for each time step) */ + vector<BufferView<Vec3fa>> vertices; + + /*! user data buffers */ + vector<RawBufferView> vertexAttribs; + + /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */ + BufferView<Edge> edge_creases; + + /*! edge crease weights for each edge of the edge_creases buffer */ + BufferView<float> edge_crease_weights; + + /*! vertex crease buffer containing all vertices that carry vertex crease weights */ + BufferView<unsigned int> vertex_creases; + + /*! vertex crease weights for each vertex of the vertex_creases buffer */ + BufferView<float> vertex_crease_weights; + + /*! subdivision level for each half edge of the vertexIndices buffer */ + BufferView<float> levels; + float tessellationRate; // constant rate that is used when levels is not set + + /*! buffer that marks specific faces as holes */ + BufferView<unsigned> holes; + + /*! all data in this section is generated by initializeHalfEdgeStructures function */ + private: + + /*! number of half edges used by faces */ + size_t numHalfEdges; + + /*! fast lookup table to find the first half edge for some face */ + mvector<uint32_t> faceStartEdge; + + /*! fast lookup table to find the face for some half edge */ + mvector<uint32_t> halfEdgeFace; + + /*! set with all holes */ + parallel_set<uint32_t> holeSet; + + /*! fast lookup table to detect invalid faces */ + mvector<char> invalid_face; + + /*! test if face i is invalid in timestep j */ + __forceinline char& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } + __forceinline const char& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } + + /*! interpolation cache */ + public: + static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; } + static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; } + static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) { + const size_t slots = numInterpolationSlots4(stride); + assert(slot < slots); + return slots*prim+slot; + } + std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags; + std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags; + std::vector<Patch3fa::Ref> patch_eval_trees; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! map with all vertex creases */ + parallel_map<uint32_t,float> vertexCreaseMap; + + /*! map with all edge creases */ + parallel_map<uint64_t,float> edgeCreaseMap; + + protected: + + /*! counts number of geometry commits */ + size_t commitCounter; + }; + + namespace isa + { + struct SubdivMeshISA : public SubdivMesh + { + SubdivMeshISA (Device* device) + : SubdivMesh(device) {} + + void interpolate(const RTCInterpolateArguments* const args); + void interpolateN(const RTCInterpolateNArguments* const args); + }; + } + + DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*); +}; diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp new file mode 100644 index 0000000000..3bbd7e51ae --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.cpp @@ -0,0 +1,194 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene_triangle_mesh.h" +#include "scene.h" + +namespace embree +{ +#if defined(EMBREE_LOWEST_ISA) + + TriangleMesh::TriangleMesh (Device* device) + : Geometry(device,GTY_TRIANGLE_MESH,0,1) + { + vertices.resize(numTimeSteps); + } + + void TriangleMesh::setMask (unsigned mask) + { + this->mask = mask; + Geometry::update(); + } + + void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps) + { + vertices.resize(numTimeSteps); + Geometry::setNumTimeSteps(numTimeSteps); + } + + void TriangleMesh::setVertexAttributeCount (unsigned int N) + { + vertexAttribs.resize(N); + Geometry::update(); + } + + void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) + { + /* verify that all accesses are 4 bytes aligned */ + if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned"); + + if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (format != RTC_FORMAT_FLOAT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format"); + + /* if buffer is larger than 16GB the premultiplied index optimization does not work */ + if (stride*num > 16ll*1024ll*1024ll*1024ll) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large"); + + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot"); + + vertices[slot].set(buffer, offset, stride, num, format); + vertices[slot].checkPadding16(); + vertices0 = vertices[0]; + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format"); + + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot"); + + vertexAttribs[slot].set(buffer, offset, stride, num, format); + vertexAttribs[slot].checkPadding16(); + } + else if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + if (format != RTC_FORMAT_UINT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format"); + + triangles.set(buffer, offset, stride, num, format); + setNumPrimitives(num); + } + else + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return triangles.getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertices[slot].getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertexAttribs[slot].getPtr(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + return nullptr; + } + } + + void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + triangles.setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertices[slot].setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertexAttribs[slot].setModified(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + Geometry::update(); + } + + void TriangleMesh::commit() + { + /* verify that stride of all time steps are identical */ + for (unsigned int t=0; t<numTimeSteps; t++) + if (vertices[t].getStride() != vertices[0].getStride()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step"); + + Geometry::commit(); + } + + void TriangleMesh::addElementsToCount (GeometryCounts & counts) const + { + if (numTimeSteps == 1) counts.numTriangles += numPrimitives; + else counts.numMBTriangles += numPrimitives; + } + + bool TriangleMesh::verify() + { + /*! verify size of vertex arrays */ + if (vertices.size() == 0) return false; + for (const auto& buffer : vertices) + if (buffer.size() != numVertices()) + return false; + + /*! verify size of user vertex arrays */ + for (const auto& buffer : vertexAttribs) + if (buffer.size() != numVertices()) + return false; + + /*! verify triangle indices */ + for (size_t i=0; i<size(); i++) { + if (triangles[i].v[0] >= numVertices()) return false; + if (triangles[i].v[1] >= numVertices()) return false; + if (triangles[i].v[2] >= numVertices()) return false; + } + + /*! verify vertices */ + for (const auto& buffer : vertices) + for (size_t i=0; i<buffer.size(); i++) + if (!isvalid(buffer[i])) + return false; + + return true; + } + + void TriangleMesh::interpolate(const RTCInterpolateArguments* const args) { + interpolate_impl<4>(args); + } + +#endif + + namespace isa + { + TriangleMesh* createTriangleMesh(Device* device) { + return new TriangleMeshISA(device); + } + } +} diff --git a/thirdparty/embree/kernels/common/scene_triangle_mesh.h b/thirdparty/embree/kernels/common/scene_triangle_mesh.h new file mode 100644 index 0000000000..ad3f602fde --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_triangle_mesh.h @@ -0,0 +1,318 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Triangle Mesh */ + struct TriangleMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH; + + /*! triangle indices */ + struct Triangle + { + uint32_t v[3]; + + /*! outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) { + return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }"; + } + }; + + public: + + /*! triangle mesh construction */ + TriangleMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + template<int N> + void interpolate_impl(const RTCInterpolateArguments* const args) + { + unsigned int primID = args->primID; + float u = args->u; + float v = args->v; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i<valueCount; i+=N) + { + size_t ofs = i*sizeof(float); + const float w = 1.0f-u-v; + const Triangle& tri = triangle(primID); + const vbool<N> valid = vint<N>((int)i)+vint<N>(step) < vint<N>(int(valueCount)); + const vfloat<N> p0 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]); + const vfloat<N> p1 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]); + const vfloat<N> p2 = mem<vfloat<N>>::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]); + + if (P) { + mem<vfloat<N>>::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2))); + } + if (dPdu) { + assert(dPdu); mem<vfloat<N>>::storeu(valid,dPdu+i,p1-p0); + assert(dPdv); mem<vfloat<N>>::storeu(valid,dPdv+i,p2-p0); + } + if (ddPdudu) { + assert(ddPdudu); mem<vfloat<N>>::storeu(valid,ddPdudu+i,vfloat<N>(zero)); + assert(ddPdvdv); mem<vfloat<N>>::storeu(valid,ddPdvdv+i,vfloat<N>(zero)); + assert(ddPdudv); mem<vfloat<N>>::storeu(valid,ddPdudv+i,vfloat<N>(zero)); + } + } + } + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th triangle*/ + __forceinline const Triangle& triangle(size_t i) const { + return triangles[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th triangle */ + __forceinline BBox3fa bounds(size_t i) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! calculates the bounds of the i'th triangle at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0],itime); + const Vec3fa v1 = vertex(tri.v[1],itime); + const Vec3fa v2 = vertex(tri.v[2],itime); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range<size_t>& itime_range) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(tri.v[0],itime))) return false; + if (!isvalid(vertex(tri.v[1],itime))) return false; + if (!isvalid(vertex(tri.v[2],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t t=0; t<numTimeSteps; t++) + { + const Vec3fa v0 = vertex(tri.v[0],t); + const Vec3fa v1 = vertex(tri.v[1],t); + const Vec3fa v2 = vertex(tri.v[2],t); + if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2))) + return false; + } + + if (likely(bbox)) + *bbox = bounds(i); + + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return triangles.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return triangles.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return areaProjectedTriangle(v0,v1,v2); + } + + public: + BufferView<Triangle> triangles; //!< array of triangles + BufferView<Vec3fa> vertices0; //!< fast access to first vertex buffer + vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep + vector<RawBufferView> vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct TriangleMeshISA : public TriangleMesh + { + TriangleMeshISA (Device* device) + : TriangleMesh(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*); +} diff --git a/thirdparty/embree/kernels/common/scene_user_geometry.h b/thirdparty/embree/kernels/common/scene_user_geometry.h new file mode 100644 index 0000000000..2867b18b79 --- /dev/null +++ b/thirdparty/embree/kernels/common/scene_user_geometry.h @@ -0,0 +1,77 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accelset.h" + +namespace embree +{ + /*! User geometry with user defined intersection functions */ + struct UserGeometry : public AccelSet + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY; + + public: + UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); + virtual void setMask (unsigned mask); + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect); + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded); + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const; + }; + + namespace isa + { + struct UserGeometryISA : public UserGeometry + { + UserGeometryISA (Device* device) + : UserGeometry(device) {} + + PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!buildBounds(j,itime,bounds)) continue; + const PrimRef prim(bounds,geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!valid(j, timeSegmentRange(t0t1))) continue; + const PrimRefMB prim(linearBounds(j,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*); +} diff --git a/thirdparty/embree/kernels/common/stack_item.h b/thirdparty/embree/kernels/common/stack_item.h new file mode 100644 index 0000000000..c31c64e862 --- /dev/null +++ b/thirdparty/embree/kernels/common/stack_item.h @@ -0,0 +1,125 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! An item on the stack holds the node ID and distance of that node. */ + template<typename T> + struct __aligned(16) StackItemT + { + /*! assert that the xchg function works */ + static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed"); + + __forceinline StackItemT() {} + + __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} + + /*! use SSE instructions to swap stack items */ + __forceinline static void xchg(StackItemT& a, StackItemT& b) + { + const vfloat4 sse_a = vfloat4::load((float*)&a); + const vfloat4 sse_b = vfloat4::load((float*)&b); + vfloat4::store(&a,sse_b); + vfloat4::store(&b,sse_a); + } + + /*! Sort 2 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2) { + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 3 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s3.dist < s2.dist) xchg(s3,s2); + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s4.dist < s3.dist) xchg(s4,s3); + if (s3.dist < s1.dist) xchg(s3,s1); + if (s4.dist < s2.dist) xchg(s4,s2); + if (s3.dist < s2.dist) xchg(s3,s2); + } + + /*! use SSE instructions to swap stack items */ + __forceinline static void cmp_xchg(vint4& a, vint4& b) + { +#if defined(__AVX512VL__) + const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); +#else + const vboolf4 mask0(b < a); + const vboolf4 mask(shuffle<2,2,2,2>(mask0)); +#endif + const vint4 c = select(mask,b,a); + const vint4 d = select(mask,a,b); + a = c; + b = d; + } + + /*! Sort 3 stack items. */ + __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3) + { + cmp_xchg(s2,s1); + cmp_xchg(s3,s2); + cmp_xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4) + { + cmp_xchg(s2,s1); + cmp_xchg(s4,s3); + cmp_xchg(s3,s1); + cmp_xchg(s4,s2); + cmp_xchg(s3,s2); + } + + + /*! Sort N stack items. */ + __forceinline friend void sort(StackItemT* begin, StackItemT* end) + { + for (StackItemT* i = begin+1; i != end; ++i) + { + const vfloat4 item = vfloat4::load((float*)i); + const unsigned dist = i->dist; + StackItemT* j = i; + + while ((j != begin) && ((j-1)->dist < dist)) + { + vfloat4::store(j, vfloat4::load((float*)(j-1))); + --j; + } + + vfloat4::store(j, item); + } + } + + public: + T ptr; + unsigned dist; + }; + + /*! An item on the stack holds the node ID and active ray mask. */ + template<typename T> + struct __aligned(8) StackItemMaskT + { + T ptr; + size_t mask; + }; + + struct __aligned(8) StackItemMaskCoherent + { + size_t mask; + size_t parent; + size_t child; + }; +} diff --git a/thirdparty/embree/kernels/common/stat.cpp b/thirdparty/embree/kernels/common/stat.cpp new file mode 100644 index 0000000000..ebb77cd534 --- /dev/null +++ b/thirdparty/embree/kernels/common/stat.cpp @@ -0,0 +1,128 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stat.h" + +namespace embree +{ + Stat Stat::instance; + + Stat::Stat () { + } + + Stat::~Stat () + { +#ifdef EMBREE_STAT_COUNTERS + Stat::print(std::cout); +#endif + } + + void Stat::print(std::ostream& cout) + { + Counters& cntrs = instance.cntrs; + Counters::Data& data = instance.cntrs.code; + //Counters::Data& data = instance.cntrs.active; + + /* print absolute numbers */ + cout << "--------- ABSOLUTE ---------" << std::endl; + cout << " #normal_travs = " << float(data.normal.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.normal.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.normal.trav_xfm_nodes )*1E-6 << "M" << std::endl; + cout << " #leaves = " << float(data.normal.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.normal.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.normal.trav_prim_hits )*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t normal_box_hits = 0; + size_t weighted_box_hits = 0; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) { + normal_box_hits += data.normal.trav_hit_boxes[i]; + weighted_box_hits += data.normal.trav_hit_boxes[i]*i; + } + cout << " #hit_boxes = " << normal_box_hits << " (total) distribution: "; + float average = 0.0f; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) + { + float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits; + cout << "[" << i << "] " << value << " "; + average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits; + } + cout << " average = " << average << std::endl; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " "; + cout << std::endl; + + if (data.shadow.travs) { + cout << " #shadow_travs = " << float(data.shadow.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.shadow.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl; + cout << " #leaves = " << float(data.shadow.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.shadow.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t shadow_box_hits = 0; + size_t weighted_shadow_box_hits = 0; + + for (size_t i=0;i<SIZE_HISTOGRAM;i++) { + shadow_box_hits += data.shadow.trav_hit_boxes[i]; + weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i; + } + cout << " #hit_boxes = "; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " "; + cout << std::endl; + for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " "; + cout << std::endl; + } + cout << std::endl; + + /* print per traversal numbers */ + cout << "--------- PER TRAVERSAL ---------" << std::endl; + float active_normal_travs = float(cntrs.active.normal.travs )/float(cntrs.all.normal.travs ); + float active_normal_trav_nodes = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes ); + float active_normal_trav_xfm_nodes = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes ); + float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves); + float active_normal_trav_prims = float(cntrs.active.normal.trav_prims )/float(cntrs.all.normal.trav_prims ); + float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits )/float(cntrs.all.normal.trav_prim_hits ); + float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop )/float(cntrs.all.normal.trav_stack_pop ); + + cout << " #normal_travs = " << float(cntrs.code.normal.travs )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs << "% active" << std::endl; + cout << " #nodes = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes << "% active" << std::endl; + cout << " #node_xfm = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes << "% active" << std::endl; + cout << " #leaves = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl; + cout << " #prims = " << float(cntrs.code.normal.trav_prims )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims << "% active" << std::endl; + cout << " #prim_hits = " << float(cntrs.code.normal.trav_prim_hits )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits << "% active" << std::endl; + cout << " #stack_pop = " << float(cntrs.code.normal.trav_stack_pop )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop << "% active" << std::endl; + + if (cntrs.all.shadow.travs) { + float active_shadow_travs = float(cntrs.active.shadow.travs )/float(cntrs.all.shadow.travs ); + float active_shadow_trav_nodes = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes ); + float active_shadow_trav_xfm_nodes = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes ); + float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves); + float active_shadow_trav_prims = float(cntrs.active.shadow.trav_prims )/float(cntrs.all.shadow.trav_prims ); + float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits )/float(cntrs.all.shadow.trav_prim_hits ); + + cout << " #shadow_travs = " << float(cntrs.code.shadow.travs )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs << "% active" << std::endl; + cout << " #nodes = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes << "% active" << std::endl; + cout << " #nodes_xfm = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes << "% active" << std::endl; + cout << " #leaves = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl; + cout << " #prims = " << float(cntrs.code.shadow.trav_prims )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims << "% active" << std::endl; + cout << " #prim_hits = " << float(cntrs.code.shadow.trav_prim_hits )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits << "% active" << std::endl; + + } + cout << std::endl; + + /* print user counters for performance tuning */ + cout << "--------- USER ---------" << std::endl; + for (size_t i=0; i<10; i++) + cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl; + + cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl; + cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl; + cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl; + cout << std::endl; + } +} diff --git a/thirdparty/embree/kernels/common/stat.h b/thirdparty/embree/kernels/common/stat.h new file mode 100644 index 0000000000..02fc07e67f --- /dev/null +++ b/thirdparty/embree/kernels/common/stat.h @@ -0,0 +1,116 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +/* Macros to gather statistics */ +#ifdef EMBREE_STAT_COUNTERS +# define STAT(x) x +# define STAT3(s,x,y,z) \ + STAT(Stat::get().code .s+=x); \ + STAT(Stat::get().active.s+=y); \ + STAT(Stat::get().all .s+=z); +# define STAT_USER(i,x) Stat::get().user[i]+=x; +#else +# define STAT(x) +# define STAT3(s,x,y,z) +# define STAT_USER(i,x) +#endif + +namespace embree +{ + /*! Gathers ray tracing statistics. We count 1) how often a code + * location is reached, 2) how many SIMD lanes are active, 3) how + * many SIMD lanes reach the code location */ + class Stat + { + public: + + static const size_t SIZE_HISTOGRAM = 64+1; + + /*! constructs stat counter class */ + Stat (); + + /*! destructs stat counter class */ + ~Stat (); + + class Counters + { + public: + Counters () { + clear(); + } + + void clear() + { + all.clear(); + active.clear(); + code.clear(); + for (auto& u : user) u.store(0); + } + + public: + + /* per packet and per ray stastics */ + struct Data + { + void clear () { + normal.clear(); + shadow.clear(); + point_query.clear(); + } + + /* normal and shadow ray statistics */ + struct + { + void clear() + { + travs.store(0); + trav_nodes.store(0); + trav_leaves.store(0); + trav_prims.store(0); + trav_prim_hits.store(0); + for (auto& v : trav_hit_boxes) v.store(0); + trav_stack_pop.store(0); + trav_stack_nodes.store(0); + trav_xfm_nodes.store(0); + } + + public: + std::atomic<size_t> travs; + std::atomic<size_t> trav_nodes; + std::atomic<size_t> trav_leaves; + std::atomic<size_t> trav_prims; + std::atomic<size_t> trav_prim_hits; + std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1]; + std::atomic<size_t> trav_stack_pop; + std::atomic<size_t> trav_stack_nodes; + std::atomic<size_t> trav_xfm_nodes; + + } normal, shadow, point_query; + } all, active, code; + + std::atomic<size_t> user[10]; + }; + + public: + + static __forceinline Counters& get() { + return instance.cntrs; + } + + static void clear() { + instance.cntrs.clear(); + } + + static void print(embree_ostream cout); + + private: + Counters cntrs; + + private: + static Stat instance; + }; +} diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp new file mode 100644 index 0000000000..01c862da0c --- /dev/null +++ b/thirdparty/embree/kernels/common/state.cpp @@ -0,0 +1,519 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "state.h" +#include "../../common/lexers/streamfilters.h" + +namespace embree +{ + MutexSys g_printMutex; + + State::ErrorHandler State::g_errorHandler; + + State::ErrorHandler::ErrorHandler() + : thread_error(createTls()) {} + + State::ErrorHandler::~ErrorHandler() + { + Lock<MutexSys> lock(errors_mutex); + for (size_t i=0; i<thread_errors.size(); i++) + delete thread_errors[i]; + destroyTls(thread_error); + thread_errors.clear(); + } + + RTCError* State::ErrorHandler::error() + { + RTCError* stored_error = (RTCError*) getTls(thread_error); + if (stored_error) return stored_error; + + Lock<MutexSys> lock(errors_mutex); + stored_error = new RTCError(RTC_ERROR_NONE); + thread_errors.push_back(stored_error); + setTls(thread_error,stored_error); + return stored_error; + } + + State::State () + : enabled_cpu_features(getCPUFeatures()), + enabled_builder_cpu_features(enabled_cpu_features), + frequency_level(FREQUENCY_SIMD256) + { + tri_accel = "default"; + tri_builder = "default"; + tri_traverser = "default"; + + tri_accel_mb = "default"; + tri_builder_mb = "default"; + tri_traverser_mb = "default"; + + quad_accel = "default"; + quad_builder = "default"; + quad_traverser = "default"; + + quad_accel_mb = "default"; + quad_builder_mb = "default"; + quad_traverser_mb = "default"; + + line_accel = "default"; + line_builder = "default"; + line_traverser = "default"; + + line_accel_mb = "default"; + line_builder_mb = "default"; + line_traverser_mb = "default"; + + hair_accel = "default"; + hair_builder = "default"; + hair_traverser = "default"; + + hair_accel_mb = "default"; + hair_builder_mb = "default"; + hair_traverser_mb = "default"; + + object_accel = "default"; + object_builder = "default"; + object_accel_min_leaf_size = 1; + object_accel_max_leaf_size = 1; + + object_accel_mb = "default"; + object_builder_mb = "default"; + object_accel_mb_min_leaf_size = 1; + object_accel_mb_max_leaf_size = 1; + + max_spatial_split_replications = 1.2f; + useSpatialPreSplits = false; + + tessellation_cache_size = 128*1024*1024; + + subdiv_accel = "default"; + subdiv_accel_mb = "default"; + + grid_accel = "default"; + grid_builder = "default"; + grid_accel_mb = "default"; + grid_builder_mb = "default"; + + instancing_open_min = 0; + instancing_block_size = 0; + instancing_open_factor = 8.0f; + instancing_open_max_depth = 32; + instancing_open_max = 50000000; + + float_exceptions = false; + quality_flags = -1; + scene_flags = -1; + verbose = 0; + benchmark = 0; + + numThreads = 0; + numUserThreads = 0; + +#if TASKING_INTERNAL + set_affinity = true; +#else + set_affinity = false; +#endif + + start_threads = false; + enable_selockmemoryprivilege = false; +#if defined(__LINUX__) + hugepages = true; +#else + hugepages = false; +#endif + hugepages_success = true; + + alloc_main_block_size = 0; + alloc_num_main_slots = 0; + alloc_thread_block_size = 0; + alloc_single_thread_alloc = -1; + + error_function = nullptr; + error_function_userptr = nullptr; + + memory_monitor_function = nullptr; + memory_monitor_userptr = nullptr; + } + + State::~State() { + } + + bool State::hasISA(const int isa) { + return (enabled_cpu_features & isa) == isa; + } + + bool State::checkISASupport() { + return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; + } + + void State::verify() + { + /* verify that calculations stay in range */ + assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX); + + /* here we verify that CPP files compiled for a specific ISA only + * call that same or lower ISA version of non-inlined class member + * functions */ +#if defined(DEBUG) +#if defined(EMBREE_TARGET_SSE2) + assert(sse2::getISA() <= SSE2); +#endif +#if defined(EMBREE_TARGET_SSE42) + assert(sse42::getISA() <= SSE42); +#endif +#if defined(EMBREE_TARGET_AVX) + assert(avx::getISA() <= AVX); +#endif +#if defined(EMBREE_TARGET_AVX2) + assert(avx2::getISA() <= AVX2); +#endif +#if defined (EMBREE_TARGET_AVX512) + assert(avx512::getISA() <= AVX512); +#endif +#endif + } + + const char* symbols[3] = { "=", ",", "|" }; + + bool State::parseFile(const FileName& fileName) + { + FILE* f = fopen(fileName.c_str(),"r"); + if (!f) return false; + Ref<Stream<int> > file = new FileStream(f,fileName); + + std::vector<std::string> syms; + for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) + syms.push_back(symbols[i]); + + Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(file,"#"), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + return true; + } + + void State::parseString(const char* cfg) + { + if (cfg == nullptr) return; + + std::vector<std::string> syms; + for (size_t i=0; i<sizeof(symbols)/sizeof(void*); i++) + syms.push_back(symbols[i]); + + Ref<TokenStream> cin = new TokenStream(new StrStream(cfg), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + } + + int string_to_cpufeatures(const std::string& isa) + { + if (isa == "sse" ) return SSE; + else if (isa == "sse2") return SSE2; + else if (isa == "sse3") return SSE3; + else if (isa == "ssse3") return SSSE3; + else if (isa == "sse41") return SSE41; + else if (isa == "sse4.1") return SSE41; + else if (isa == "sse42") return SSE42; + else if (isa == "sse4.2") return SSE42; + else if (isa == "avx") return AVX; + else if (isa == "avxi") return AVXI; + else if (isa == "avx2") return AVX2; + else if (isa == "avx512") return AVX512; + else return SSE2; + } + + void State::parse(Ref<TokenStream> cin) + { + /* parse until end of stream */ + while (cin->peek() != Token::Eof()) + { + const Token tok = cin->get(); + + if (tok == Token::Id("threads") && cin->trySymbol("=")) + numThreads = cin->get().Int(); + + else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) + numUserThreads = cin->get().Int(); + + else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) + start_threads = cin->get().Int(); + + else if (tok == Token::Id("isa") && cin->trySymbol("=")) { + std::string isa_str = toLowerCase(cin->get().Identifier()); + enabled_cpu_features = string_to_cpufeatures(isa_str); + enabled_builder_cpu_features = enabled_cpu_features; + } + + else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { + std::string isa_str = toLowerCase(cin->get().Identifier()); + enabled_cpu_features &= string_to_cpufeatures(isa_str); + enabled_builder_cpu_features &= enabled_cpu_features; + } + + else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { + std::string isa_str = toLowerCase(cin->get().Identifier()); + enabled_builder_cpu_features &= string_to_cpufeatures(isa_str); + } + + else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) { + std::string freq = cin->get().Identifier(); + if (freq == "simd128") frequency_level = FREQUENCY_SIMD128; + else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256; + else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512; + } + + else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) { + enable_selockmemoryprivilege = cin->get().Int(); + } + else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) { + hugepages = cin->get().Int(); + } + + else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) + float_exceptions = cin->get().Int(); + + else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) + tri_accel = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("=")) + tri_builder = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("=")) + tri_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("=")) + tri_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("=")) + tri_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("=")) + tri_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("=")) + quad_accel = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("=")) + quad_builder = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("=")) + quad_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("=")) + quad_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("=")) + quad_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("=")) + quad_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel")) && cin->trySymbol("=")) + line_accel = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder")) && cin->trySymbol("=")) + line_builder = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("=")) + line_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("=")) + line_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("=")) + line_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("=")) + line_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel") && cin->trySymbol("=")) + hair_accel = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder") && cin->trySymbol("=")) + hair_builder = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser") && cin->trySymbol("=")) + hair_traverser = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("=")) + hair_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("=")) + hair_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("=")) + hair_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("object_accel") && cin->trySymbol("=")) + object_accel = cin->get().Identifier(); + else if (tok == Token::Id("object_builder") && cin->trySymbol("=")) + object_builder = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("=")) + object_accel_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("=")) + object_accel_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("=")) + object_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("=")) + object_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("=")) + object_accel_mb_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("=")) + object_accel_mb_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("=")) + instancing_open_min = cin->get().Int(); + else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { + instancing_block_size = cin->get().Int(); + instancing_open_factor = 0.0f; + } + else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("=")) + instancing_open_max_depth = cin->get().Int(); + else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { + instancing_block_size = 0; + instancing_open_factor = cin->get().Float(); + } + else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("=")) + instancing_open_max = cin->get().Int(); + + else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("=")) + subdiv_accel = cin->get().Identifier(); + else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("=")) + subdiv_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("grid_accel") && cin->trySymbol("=")) + grid_accel = cin->get().Identifier(); + else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("=")) + grid_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("verbose") && cin->trySymbol("=")) + verbose = cin->get().Int(); + else if (tok == Token::Id("benchmark") && cin->trySymbol("=")) + benchmark = cin->get().Int(); + + else if (tok == Token::Id("quality")) { + if (cin->trySymbol("=")) { + Token flag = cin->get(); + if (flag == Token::Id("low")) quality_flags = RTC_BUILD_QUALITY_LOW; + else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM; + else if (flag == Token::Id("high")) quality_flags = RTC_BUILD_QUALITY_HIGH; + } + } + + else if (tok == Token::Id("scene_flags")) { + scene_flags = 0; + if (cin->trySymbol("=")) { + do { + Token flag = cin->get(); + if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC; + else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT; + else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST; + } while (cin->trySymbol("|")); + } + } + + else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("=")) + max_spatial_split_replications = cin->get().Float(); + + else if (tok == Token::Id("presplits") && cin->trySymbol("=")) + useSpatialPreSplits = cin->get().Int() != 0 ? true : false; + + else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + + else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("=")) + alloc_main_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("=")) + alloc_num_main_slots = cin->get().Int(); + else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("=")) + alloc_thread_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("=")) + alloc_single_thread_alloc = cin->get().Int(); + + cin->trySymbol(","); // optional , separator + } + } + + bool State::verbosity(size_t N) { + return N <= verbose; + } + + void State::print() + { + std::cout << "general:" << std::endl; + std::cout << " build threads = " << numThreads << std::endl; + std::cout << " build user threads = " << numUserThreads << std::endl; + std::cout << " start_threads = " << start_threads << std::endl; + std::cout << " affinity = " << set_affinity << std::endl; + std::cout << " frequency_level = "; + switch (frequency_level) { + case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break; + case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break; + case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break; + default: std::cout << "error" << std::endl; break; + } + + std::cout << " hugepages = "; + if (!hugepages) std::cout << "disabled" << std::endl; + else if (hugepages_success) std::cout << "enabled" << std::endl; + else std::cout << "failed" << std::endl; + + std::cout << " verbosity = " << verbose << std::endl; + std::cout << " cache_size = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl; + std::cout << " max_spatial_split_replications = " << max_spatial_split_replications << std::endl; + + std::cout << "triangles:" << std::endl; + std::cout << " accel = " << tri_accel << std::endl; + std::cout << " builder = " << tri_builder << std::endl; + std::cout << " traverser = " << tri_traverser << std::endl; + + std::cout << "motion blur triangles:" << std::endl; + std::cout << " accel = " << tri_accel_mb << std::endl; + std::cout << " builder = " << tri_builder_mb << std::endl; + std::cout << " traverser = " << tri_traverser_mb << std::endl; + + std::cout << "quads:" << std::endl; + std::cout << " accel = " << quad_accel << std::endl; + std::cout << " builder = " << quad_builder << std::endl; + std::cout << " traverser = " << quad_traverser << std::endl; + + std::cout << "motion blur quads:" << std::endl; + std::cout << " accel = " << quad_accel_mb << std::endl; + std::cout << " builder = " << quad_builder_mb << std::endl; + std::cout << " traverser = " << quad_traverser_mb << std::endl; + + std::cout << "line segments:" << std::endl; + std::cout << " accel = " << line_accel << std::endl; + std::cout << " builder = " << line_builder << std::endl; + std::cout << " traverser = " << line_traverser << std::endl; + + std::cout << "motion blur line segments:" << std::endl; + std::cout << " accel = " << line_accel_mb << std::endl; + std::cout << " builder = " << line_builder_mb << std::endl; + std::cout << " traverser = " << line_traverser_mb << std::endl; + + std::cout << "hair:" << std::endl; + std::cout << " accel = " << hair_accel << std::endl; + std::cout << " builder = " << hair_builder << std::endl; + std::cout << " traverser = " << hair_traverser << std::endl; + + std::cout << "motion blur hair:" << std::endl; + std::cout << " accel = " << hair_accel_mb << std::endl; + std::cout << " builder = " << hair_builder_mb << std::endl; + std::cout << " traverser = " << hair_traverser_mb << std::endl; + + std::cout << "subdivision surfaces:" << std::endl; + std::cout << " accel = " << subdiv_accel << std::endl; + + std::cout << "grids:" << std::endl; + std::cout << " accel = " << grid_accel << std::endl; + std::cout << " builder = " << grid_builder << std::endl; + + std::cout << "motion blur grids:" << std::endl; + std::cout << " accel = " << grid_accel_mb << std::endl; + std::cout << " builder = " << grid_builder_mb << std::endl; + + std::cout << "object_accel:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_max_leaf_size << std::endl; + + std::cout << "object_accel_mb:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_mb_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_mb_max_leaf_size << std::endl; + } +} diff --git a/thirdparty/embree/kernels/common/state.h b/thirdparty/embree/kernels/common/state.h new file mode 100644 index 0000000000..33bcc843b2 --- /dev/null +++ b/thirdparty/embree/kernels/common/state.h @@ -0,0 +1,196 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* mutex to make printing to cout thread safe */ + extern MutexSys g_printMutex; + + struct State : public RefCount + { + public: + /*! state construction */ + State (); + + /*! state destruction */ + ~State(); + + /*! verifies that state is correct */ + void verify(); + + /*! parses state from a configuration file */ + bool parseFile(const FileName& fileName); + + /*! parses the state from a string */ + void parseString(const char* cfg); + + /*! parses the state from a stream */ + void parse(Ref<TokenStream> cin); + + /*! prints the state */ + void print(); + + /*! checks if verbosity level is at least N */ + bool verbosity(size_t N); + + /*! checks if some particular ISA is enabled */ + bool hasISA(const int isa); + + /*! check whether selected ISA is supported by the HW */ + bool checkISASupport(); + + public: + std::string tri_accel; //!< acceleration structure to use for triangles + std::string tri_builder; //!< builder to use for triangles + std::string tri_traverser; //!< traverser to use for triangles + + public: + std::string tri_accel_mb; //!< acceleration structure to use for motion blur triangles + std::string tri_builder_mb; //!< builder to use for motion blur triangles + std::string tri_traverser_mb; //!< traverser to use for triangles + + public: + std::string quad_accel; //!< acceleration structure to use for quads + std::string quad_builder; //!< builder to use for quads + std::string quad_traverser; //!< traverser to use for quads + + public: + std::string quad_accel_mb; //!< acceleration structure to use for motion blur quads + std::string quad_builder_mb; //!< builder to use for motion blur quads + std::string quad_traverser_mb; //!< traverser to use for motion blur quads + + public: + std::string line_accel; //!< acceleration structure to use for line segments + std::string line_builder; //!< builder to use for line segments + std::string line_traverser; //!< traverser to use for line segments + + public: + std::string line_accel_mb; //!< acceleration structure to use for motion blur line segments + std::string line_builder_mb; //!< builder to use for motion blur line segments + std::string line_traverser_mb; //!< traverser to use for motion blur line segments + + public: + std::string hair_accel; //!< hair acceleration structure to use + std::string hair_builder; //!< builder to use for hair + std::string hair_traverser; //!< traverser to use for hair + + public: + std::string hair_accel_mb; //!< acceleration structure to use for motion blur hair + std::string hair_builder_mb; //!< builder to use for motion blur hair + std::string hair_traverser_mb; //!< traverser to use for motion blur hair + + public: + std::string object_accel; //!< acceleration structure for user geometries + std::string object_builder; //!< builder for user geometries + int object_accel_min_leaf_size; //!< minimum leaf size for object acceleration structure + int object_accel_max_leaf_size; //!< maximum leaf size for object acceleration structure + + public: + std::string object_accel_mb; //!< acceleration structure for user geometries + std::string object_builder_mb; //!< builder for user geometries + int object_accel_mb_min_leaf_size; //!< minimum leaf size for mblur object acceleration structure + int object_accel_mb_max_leaf_size; //!< maximum leaf size for mblur object acceleration structure + + public: + std::string subdiv_accel; //!< acceleration structure to use for subdivision surfaces + std::string subdiv_accel_mb; //!< acceleration structure to use for subdivision surfaces + + public: + std::string grid_accel; //!< acceleration structure to use for grids + std::string grid_builder; //!< builder for grids + std::string grid_accel_mb; //!< acceleration structure to use for motion blur grids + std::string grid_builder_mb; //!< builder for motion blur grids + + public: + float max_spatial_split_replications; //!< maximally replications*N many primitives in accel for spatial splits + bool useSpatialPreSplits; //!< use spatial pre-splits instead of the full spatial split builder + size_t tessellation_cache_size; //!< size of the shared tessellation cache + + public: + size_t instancing_open_min; //!< instancing opens tree to minimally that number of subtrees + size_t instancing_block_size; //!< instancing opens tree up to average block size of primitives + float instancing_open_factor; //!< instancing opens tree up to x times the number of instances + size_t instancing_open_max_depth; //!< maximum open depth for geometries + size_t instancing_open_max; //!< instancing opens tree to maximally that number of subtrees + + public: + bool float_exceptions; //!< enable floating point exceptions + int quality_flags; + int scene_flags; + size_t verbose; //!< verbosity of output + size_t benchmark; //!< true + + public: + size_t numThreads; //!< number of threads to use in builders + size_t numUserThreads; //!< number of user provided threads to use in builders + bool set_affinity; //!< sets affinity for worker threads + bool start_threads; //!< true when threads should be started at device creation time + int enabled_cpu_features; //!< CPU ISA features to use + int enabled_builder_cpu_features; //!< CPU ISA features to use for builders only + enum FREQUENCY_LEVEL { + FREQUENCY_SIMD128, + FREQUENCY_SIMD256, + FREQUENCY_SIMD512 + } frequency_level; //!< frequency level the app wants to run on (default is SIMD256) + bool enable_selockmemoryprivilege; //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages + bool hugepages; //!< true if huge pages should get used + bool hugepages_success; //!< status for enabling huge pages + + public: + size_t alloc_main_block_size; //!< main allocation block size (shared between threads) + int alloc_num_main_slots; //!< number of such shared blocks to be used to allocate + size_t alloc_thread_block_size; //!< size of thread local allocator block size + int alloc_single_thread_alloc; //!< in single mode nodes and leaves use same thread local allocator + + public: + + /*! checks if we can use AVX */ + bool canUseAVX() { + return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128; + } + + /*! checks if we can use AVX2 */ + bool canUseAVX2() { + return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128; + } + + struct ErrorHandler + { + public: + ErrorHandler(); + ~ErrorHandler(); + RTCError* error(); + + public: + tls_t thread_error; + std::vector<RTCError*> thread_errors; + MutexSys errors_mutex; + }; + ErrorHandler errorHandler; + static ErrorHandler g_errorHandler; + + public: + void setErrorFunction(RTCErrorFunction fptr, void* uptr) + { + error_function = fptr; + error_function_userptr = uptr; + } + + RTCErrorFunction error_function; + void* error_function_userptr; + + public: + void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) + { + memory_monitor_function = fptr; + memory_monitor_userptr = uptr; + } + + RTCMemoryMonitorFunction memory_monitor_function; + void* memory_monitor_userptr; + }; +} diff --git a/thirdparty/embree/kernels/common/vector.h b/thirdparty/embree/kernels/common/vector.h new file mode 100644 index 0000000000..4b08275f3b --- /dev/null +++ b/thirdparty/embree/kernels/common/vector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "default.h" + +namespace embree +{ + /*! invokes the memory monitor callback */ + struct MemoryMonitorInterface { + virtual void memoryMonitor(ssize_t bytes, bool post) = 0; + }; + + /*! allocator that performs aligned monitored allocations */ + template<typename T, size_t alignment = 64> + struct aligned_monitored_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) + : device(device), hugepages(false) {} + + __forceinline pointer allocate( size_type n ) + { + if (n) { + assert(device); + device->memoryMonitor(n*sizeof(T),false); + } + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + { + pointer p = (pointer) os_malloc(n*sizeof(value_type),hugepages); + assert(p); + return p; + } + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) + { + if (p) + { + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + os_free(p,n*sizeof(value_type),hugepages); + else + alignedFree(p); + } + else assert(n == 0); + + if (n) { + assert(device); + device->memoryMonitor(-ssize_t(n)*sizeof(T),true); + } + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + private: + MemoryMonitorInterface* device; + bool hugepages; + }; + + /*! monitored vector */ + template<typename T> + using mvector = vector_t<T,aligned_monitored_allocator<T,std::alignment_of<T>::value> >; +} diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h new file mode 100644 index 0000000000..80a8ab2a56 --- /dev/null +++ b/thirdparty/embree/kernels/config.h @@ -0,0 +1,76 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +/* #undef EMBREE_RAY_MASK */ +/* #undef EMBREE_STAT_COUNTERS */ +/* #undef EMBREE_BACKFACE_CULLING */ +/* #undef EMBREE_BACKFACE_CULLING_CURVES */ +#define EMBREE_FILTER_FUNCTION +/* #undef EMBREE_IGNORE_INVALID_RAYS */ +#define EMBREE_GEOMETRY_TRIANGLE +/* #undef EMBREE_GEOMETRY_QUAD */ +/* #undef EMBREE_GEOMETRY_CURVE */ +/* #undef EMBREE_GEOMETRY_SUBDIVISION */ +/* #undef EMBREE_GEOMETRY_USER */ +/* #undef EMBREE_GEOMETRY_INSTANCE */ +/* #undef EMBREE_GEOMETRY_GRID */ +/* #undef EMBREE_GEOMETRY_POINT */ +/* #undef EMBREE_RAY_PACKETS */ +/* #undef EMBREE_COMPACT_POLYS */ + +#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + #define IF_ENABLED_TRIS(x) x +#else + #define IF_ENABLED_TRIS(x) +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + #define IF_ENABLED_QUADS(x) x +#else + #define IF_ENABLED_QUADS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_CURVES_OR_POINTS(x) x +#else + #define IF_ENABLED_CURVES_OR_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + #define IF_ENABLED_CURVES(x) x +#else + #define IF_ENABLED_CURVES(x) +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_POINTS(x) x +#else + #define IF_ENABLED_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + #define IF_ENABLED_SUBDIV(x) x +#else + #define IF_ENABLED_SUBDIV(x) +#endif + +#if defined(EMBREE_GEOMETRY_USER) + #define IF_ENABLED_USER(x) x +#else + #define IF_ENABLED_USER(x) +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + #define IF_ENABLED_INSTANCE(x) x +#else + #define IF_ENABLED_INSTANCE(x) +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + #define IF_ENABLED_GRIDS(x) x +#else + #define IF_ENABLED_GRIDS(x) +#endif diff --git a/thirdparty/embree/kernels/geometry/cone.h b/thirdparty/embree/kernels/geometry/cone.h new file mode 100644 index 0000000000..17429bab32 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/cone.h @@ -0,0 +1,321 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct Cone + { + const Vec3fa p0; //!< start position of cone + const Vec3fa p1; //!< end position of cone + const float r0; //!< start radius of cone + const float r1; //!< end radius of cone + + __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) + : p0(p0), p1(p1), r0(r0), r1(r1) {} + + __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, + BBox1f& t_o, + float& u0_o, Vec3fa& Ng0_o, + float& u1_o, Vec3fa& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const Vec3fa v0 = p0-org; + const Vec3fa v1 = p1-org; + + const float rl = rcp_length(v1-v0); + const Vec3fa P0 = v0, dP = (v1-v0)*rl; + const float dr = (r1-r0)*rl; + const Vec3fa O = -P0, dO = dir; + + const float dOdO = dot(dO,dO); + const float OdO = dot(dO,O); + const float OO = dot(O,O); + const float dOz = dot(dP,dO); + const float Oz = dot(dP,O); + + const float R = r0 + Oz*dr; + const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr)); + const float B = 2.0f * (OdO - dOz*(Oz + R*dr)); + const float C = OO - (sqr(Oz) + sqr(R)); + + /* we miss the cone if determinant is smaller than zero */ + const float D = B*B - 4.0f*A*C; + if (D < 0.0f) return false; + + /* special case for rays that are "parallel" to the cone */ + const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + if (unlikely(abs(A) < eps)) + { + /* cylinder case */ + if (abs(dr) < 16.0f*float(ulp)) { + if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } + else { t_o = BBox1f(pos_inf,neg_inf); return false; } + } + + /* cone case */ + else + { + /* if we hit the negative cone there cannot be a hit */ + const float t = -C/B; + const float z0 = Oz+t*dOz; + const float z0r = r0+z0*dr; + if (z0r < 0.0f) return false; + + /* test if we start inside or outside the cone */ + if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf); + else t_o = BBox1f(neg_inf,t); + } + } + + /* standard case for "non-parallel" rays */ + else + { + const float Q = sqrt(D); + const float rcp_2A = rcp(2.0f*A); + t_o.lower = (-B-Q)*rcp_2A; + t_o.upper = (-B+Q)*rcp_2A; + + /* standard case where both hits are on same cone */ + if (likely(A > 0.0f)) { + const float z0 = Oz+t_o.lower*dOz; + const float z0r = r0+z0*dr; + if (z0r < 0.0f) return false; + } + + /* special case where the hits are on the positive and negative cone */ + else + { + /* depending on the ray direction and the open direction + * of the cone we have a hit from inside or outside the + * cone */ + if (dOz*dr > 0) t_o.upper = pos_inf; + else t_o.lower = neg_inf; + } + } + + /* calculates u and Ng for near hit */ + { + u0_o = (Oz+t_o.lower*dOz)*rl; + const Vec3fa Pr = t_o.lower*dir; + const Vec3fa Pl = v0 + u0_o*(v1-v0); + const Vec3fa R = normalize(Pr-Pl); + const Vec3fa U = (p1-p0)+(r1-r0)*R; + const Vec3fa V = cross(p1-p0,R); + Ng0_o = cross(V,U); + } + + /* calculates u and Ng for far hit */ + { + u1_o = (Oz+t_o.upper*dOz)*rl; + const Vec3fa Pr = t_o.upper*dir; + const Vec3fa Pl = v0 + u1_o*(v1-v0); + const Vec3fa R = normalize(Pr-Pl); + const Vec3fa U = (p1-p0)+(r1-r0)*R; + const Vec3fa V = cross(p1-p0,R); + Ng1_o = cross(V,U); + } + return true; + } + + __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const + { + float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o; + return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + + static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1) + { + float eps = 0.001f; + BBox1f t; bool hit; + hit = cone.intersect(ray.org,ray.dir,t); + + bool failed = hit != shouldhit; + if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps; + if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps; + if (!failed) return true; + embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; + return false; + } + + /* verify cone class */ + static bool verify() + { + bool passed = true; + const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf); + passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); + passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f); + passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf); + passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); + passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f); + passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6); + passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f); + const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f); + passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f); + passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f); + const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + return passed; + } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) { + return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}"; + } + }; + + template<int N> + struct ConeN + { + typedef Vec3<vfloat<N>> Vec3vfN; + + const Vec3vfN p0; //!< start position of cone + const Vec3vfN p1; //!< end position of cone + const vfloat<N> r0; //!< start radius of cone + const vfloat<N> r1; //!< end radius of cone + + __forceinline ConeN(const Vec3vfN& p0, const vfloat<N>& r0, const Vec3vfN& p1, const vfloat<N>& r1) + : p0(p0), p1(p1), r0(r0), r1(r1) {} + + __forceinline Cone operator[] (const size_t i) const + { + assert(i<N); + return Cone(Vec3fa(p0.x[i],p0.y[i],p0.z[i]),r0[i],Vec3fa(p1.x[i],p1.y[i],p1.z[i]),r1[i]); + } + + __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, + BBox<vfloat<N>>& t_o, + vfloat<N>& u0_o, Vec3vfN& Ng0_o, + vfloat<N>& u1_o, Vec3vfN& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const Vec3vfN v0 = p0-Vec3vfN(org); + const Vec3vfN v1 = p1-Vec3vfN(org); + + const vfloat<N> rl = rcp_length(v1-v0); + const Vec3vfN P0 = v0, dP = (v1-v0)*rl; + const vfloat<N> dr = (r1-r0)*rl; + const Vec3vfN O = -P0, dO = dir; + + const vfloat<N> dOdO = dot(dO,dO); + const vfloat<N> OdO = dot(dO,O); + const vfloat<N> OO = dot(O,O); + const vfloat<N> dOz = dot(dP,dO); + const vfloat<N> Oz = dot(dP,O); + + const vfloat<N> R = r0 + Oz*dr; + const vfloat<N> A = dOdO - sqr(dOz) * (vfloat<N>(1.0f)+sqr(dr)); + const vfloat<N> B = 2.0f * (OdO - dOz*(Oz + R*dr)); + const vfloat<N> C = OO - (sqr(Oz) + sqr(R)); + + /* we miss the cone if determinant is smaller than zero */ + const vfloat<N> D = B*B - 4.0f*A*C; + vbool<N> valid = D >= 0.0f; + if (none(valid)) return valid; + + /* special case for rays that are "parallel" to the cone */ + const vfloat<N> eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + const vbool<N> validt = valid & (abs(A) < eps); + const vbool<N> validf = valid & !(abs(A) < eps); + if (unlikely(any(validt))) + { + const vboolx validtt = validt & (abs(dr) < 16.0f*float(ulp)); + const vboolx validtf = validt & (abs(dr) >= 16.0f*float(ulp)); + + /* cylinder case */ + if (unlikely(any(validtt))) + { + t_o.lower = select(validtt, select(C <= 0.0f, vfloat<N>(neg_inf), vfloat<N>(pos_inf)), t_o.lower); + t_o.upper = select(validtt, select(C <= 0.0f, vfloat<N>(pos_inf), vfloat<N>(neg_inf)), t_o.upper); + valid &= !validtt | C <= 0.0f; + } + + /* cone case */ + if (any(validtf)) + { + /* if we hit the negative cone there cannot be a hit */ + const vfloat<N> t = -C/B; + const vfloat<N> z0 = Oz+t*dOz; + const vfloat<N> z0r = r0+z0*dr; + valid &= !validtf | z0r >= 0.0f; + + /* test if we start inside or outside the cone */ + t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat<N>(neg_inf)), t_o.lower); + t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t), t_o.upper); + } + } + + /* standard case for "non-parallel" rays */ + if (likely(any(validf))) + { + const vfloat<N> Q = sqrt(D); + const vfloat<N> rcp_2A = 0.5f*rcp(A); + t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower); + t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper); + + /* standard case where both hits are on same cone */ + const vbool<N> validft = validf & A>0.0f; + const vbool<N> validff = validf & !(A>0.0f); + if (any(validft)) { + const vfloat<N> z0 = Oz+t_o.lower*dOz; + const vfloat<N> z0r = r0+z0*dr; + valid &= !validft | z0r >= 0.0f; + } + + /* special case where the hits are on the positive and negative cone */ + if (any(validff)) { + /* depending on the ray direction and the open direction + * of the cone we have a hit from inside or outside the + * cone */ + t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower); + t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper); + } + } + + /* calculates u and Ng for near hit */ + { + u0_o = (Oz+t_o.lower*dOz)*rl; + const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); + const Vec3vfN Pl = v0 + u0_o*(v1-v0); + const Vec3vfN R = normalize(Pr-Pl); + const Vec3vfN U = (p1-p0)+(r1-r0)*R; + const Vec3vfN V = cross(p1-p0,R); + Ng0_o = cross(V,U); + } + + /* calculates u and Ng for far hit */ + { + u1_o = (Oz+t_o.upper*dOz)*rl; + const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); + const Vec3vfN Pl = v0 + u1_o*(v1-v0); + const Vec3vfN R = normalize(Pr-Pl); + const Vec3vfN U = (p1-p0)+(r1-r0)*R; + const Vec3vfN V = cross(p1-p0,R); + Ng1_o = cross(V,U); + } + return valid; + } + + __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const + { + vfloat<N> u0_o; Vec3vfN Ng0_o; vfloat<N> u1_o; Vec3vfN Ng1_o; + return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + }; + } +} + diff --git a/thirdparty/embree/kernels/geometry/coneline_intersector.h b/thirdparty/embree/kernels/geometry/coneline_intersector.h new file mode 100644 index 0000000000..90f3792eff --- /dev/null +++ b/thirdparty/embree/kernels/geometry/coneline_intersector.h @@ -0,0 +1,209 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + namespace __coneline_internal + { + template<int M, typename Epilog, typename ray_tfar_func> + static __forceinline bool intersectCone(const vbool<M>& valid_i, + const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, + const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar, + const Vec4vf<M>& v0, const Vec4vf<M>& v1, + const vbool<M>& cL, const vbool<M>& cR, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + /* move ray origin closer to make calculations numerically stable */ + const vfloat<M> dOdO = sqr(ray_dir); + const vfloat<M> rcp_dOdO = rcp(dOdO); + const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz()); + const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; + const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir; + + const Vec3vf<M> dP = v1.xyz() - v0.xyz(); + const Vec3vf<M> p0 = ray_org - v0.xyz(); + const Vec3vf<M> p1 = ray_org - v1.xyz(); + + const vfloat<M> dPdP = sqr(dP); + const vfloat<M> dP0 = dot(p0,dP); + const vfloat<M> dP1 = dot(p1,dP); + const vfloat<M> dOdP = dot(ray_dir,dP); + + // intersect cone body + const vfloat<M> dr = v0.w - v1.w; + const vfloat<M> hy = dPdP + sqr(dr); + const vfloat<M> dO0 = dot(ray_dir,p0); + const vfloat<M> OO = sqr(p0); + const vfloat<M> dPdP2 = sqr(dPdP); + const vfloat<M> dPdPr0 = dPdP*v0.w; + + const vfloat<M> A = dPdP2 - sqr(dOdP)*hy; + const vfloat<M> B = dPdP2*dO0 - dP0*dOdP*hy + dPdPr0*(dr*dOdP); + const vfloat<M> C = dPdP2*OO - sqr(dP0)*hy + dPdPr0*(2.0f*dr*dP0 - dPdPr0); + + const vfloat<M> D = B*B - A*C; + valid &= D >= 0.0f; + if (unlikely(none(valid))) { + return false; + } + + /* standard case for "non-parallel" rays */ + const vfloat<M> Q = sqrt(D); + const vfloat<M> rcp_A = rcp(A); + /* special case for rays that are "parallel" to the cone - assume miss */ + const vbool<M> isParallel = abs(A) <= min_rcp_input; + + vfloat<M> t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A); + vfloat<M> t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A); + const vfloat<M> y_lower = dP0 + t_cone_lower*dOdP; + const vfloat<M> y_upper = dP0 + t_cone_upper*dOdP; + t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf); + t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf); + + const vbool<M> hitDisk0 = valid & cL; + const vbool<M> hitDisk1 = valid & cR; + const vfloat<M> rcp_dOdP = rcp(dOdP); + const vfloat<M> t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf); + const vfloat<M> t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf); + const vfloat<M> t_disk_lower = min(t_disk0, t_disk1); + const vfloat<M> t_disk_upper = max(t_disk0, t_disk1); + + const vfloat<M> t_lower = min(t_cone_lower, t_disk_lower); + const vfloat<M> t_upper = max(t_cone_upper, select(t_lower==t_disk_lower, + select(t_disk_upper==vfloat<M>(pos_inf),neg_inf,t_disk_upper), + select(t_disk_lower==vfloat<M>(pos_inf),neg_inf,t_disk_lower))); + + const vbool<M> valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat<M>(pos_inf); + const vbool<M> valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat<M>(neg_inf); + + const vbool<M> valid_first = valid_lower | valid_upper; + if (unlikely(none(valid_first))) + return false; + + const vfloat<M> t_first = select(valid_lower, t_lower, t_upper); + const vfloat<M> y_first = select(valid_lower, y_lower, y_upper); + + const vfloat<M> rcp_dPdP = rcp(dPdP); + const Vec3vf<M> dP2drr0dP = dPdP*dr*v0.w*dP; + const Vec3vf<M> dPhy = dP*hy; + const vbool<M> cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper); + const vbool<M> disk0_hit_first = valid & (t_first == t_disk0); + const Vec3vf<M> Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP)); + const vfloat<M> u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first); + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_upper; + const vfloat<M> y_second = y_upper; + const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar()); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; + const vbool<M> disk0_hit_second = t_second == t_disk0; + const Vec3vf<M> Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP)); + const vfloat<M> u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + } + + template<int M> + struct ConeLineIntersectorHitM + { + __forceinline ConeLineIntersectorHitM() {} + + __forceinline ConeLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct ConeCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + struct ray_tfar { + Ray& ray; + __forceinline ray_tfar(Ray& ray) : ray(ray) {} + __forceinline vfloat<M> operator() () const { return ray.tfar; }; + }; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const vbool<M>& cL, const vbool<M>& cR, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat<M> ray_tnear(ray.tnear()); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + return __coneline_internal::intersectCone<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,cL,cR,epilog); + } + }; + + template<int M, int K> + struct ConeCurveIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + struct ray_tfar { + RayK<K>& ray; + size_t k; + __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {} + __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }; + }; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const vbool<M>& cL, const vbool<M>& cR, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> ray_tnear = ray.tnear()[k]; + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + return __coneline_internal::intersectCone<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,cL,cR,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/conelinei_intersector.h b/thirdparty/embree/kernels/geometry/conelinei_intersector.h new file mode 100644 index 0000000000..6a985ebcad --- /dev/null +++ b/thirdparty/embree/kernels/geometry/conelinei_intersector.h @@ -0,0 +1,141 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "coneline_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct ConeCurveMiIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + return ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, bool filter> + struct ConeCurveMiMBIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()); + const vbool<M> valid = line.valid(); + ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()); + const vbool<M> valid = line.valid(); + return ConeCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, int K, bool filter> + struct ConeCurveMiIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool<M> valid = line.valid(); + return ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + + template<int M, int K, bool filter> + struct ConeCurveMiMBIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; + vbool<M> cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + return ConeCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curveNi.h b/thirdparty/embree/kernels/geometry/curveNi.h new file mode 100644 index 0000000000..6366a6fb9c --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi.h @@ -0,0 +1,222 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + template<int M> + struct CurveNi + { + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue"); + return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNi () {} + + /*! fill curve from curve list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) + { + size_t end = min(begin+M,_end); + N = (unsigned char)(end-begin); + const unsigned int geomID0 = prims[begin].geomID(); + this->geomID(N) = geomID0; + ty = (unsigned char) scene->get(geomID0)->getType(); + + /* encode all primitives */ + BBox3fa bounds = empty; + for (size_t i=0; i<N; i++) + { + const PrimRef& prim = prims[begin+i]; + const unsigned int geomID = prim.geomID(); assert(geomID == geomID0); + const unsigned int primID = prim.primID(); + bounds.extend(scene->get(geomID)->vbounds(primID)); + } + + /* calculate offset and scale */ + Vec3fa loffset = bounds.lower; + float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); + if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; + *this->offset(N) = loffset; + *this->scale(N) = lscale; + + /* encode all primitives */ + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpace(primID); + + const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); + const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID); + + bounds_vx_x(N)[i] = (char) space3.vx.x; + bounds_vx_y(N)[i] = (char) space3.vx.y; + bounds_vx_z(N)[i] = (char) space3.vx.z; + bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f); + bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f); + + bounds_vy_x(N)[i] = (char) space3.vy.x; + bounds_vy_y(N)[i] = (char) space3.vy.y; + bounds_vy_z(N)[i] = (char) space3.vy.z; + bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f); + bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f); + + bounds_vz_x(N)[i] = (char) space3.vz.x; + bounds_vz_y(N)[i] = (char) space3.vz.y; + bounds_vz_z(N)[i] = (char) space3.vz.z; + bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f); + bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f); + + this->primID(N)[i] = primID; + } + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = CurveNi::blocks(set.size()); + size_t numbytes = CurveNi::bytes(set.size()); + CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + public: + + // 27.6 - 46 bytes per primitive + unsigned char ty; + unsigned char N; + unsigned char data[4+25*M+16]; + + /* + struct Layout + { + unsigned int geomID; + unsigned int primID[N]; + + char bounds_vx_x[N]; + char bounds_vx_y[N]; + char bounds_vx_z[N]; + short bounds_vx_lower[N]; + short bounds_vx_upper[N]; + + char bounds_vy_x[N]; + char bounds_vy_y[N]; + char bounds_vy_z[N]; + short bounds_vy_lower[N]; + short bounds_vy_upper[N]; + + char bounds_vz_x[N]; + char bounds_vz_y[N]; + char bounds_vz_z[N]; + short bounds_vz_lower[N]; + short bounds_vz_upper[N]; + + Vec3f offset; + float scale; + }; + */ + + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } + + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } + __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } + + __forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } + __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } + + __forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } + __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } + + __forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } + __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } + + __forceinline short* bounds_vx_lower(size_t N) { return (short*)((char*)this+6+7*N); } + __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((char*)this+6+7*N); } + + __forceinline short* bounds_vx_upper(size_t N) { return (short*)((char*)this+6+9*N); } + __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((char*)this+6+9*N); } + + __forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+11*N); } + __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+11*N); } + + __forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+12*N); } + __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+12*N); } + + __forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+13*N); } + __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+13*N); } + + __forceinline short* bounds_vy_lower(size_t N) { return (short*)((char*)this+6+14*N); } + __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((char*)this+6+14*N); } + + __forceinline short* bounds_vy_upper(size_t N) { return (short*)((char*)this+6+16*N); } + __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((char*)this+6+16*N); } + + __forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+18*N); } + __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+18*N); } + + __forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+19*N); } + __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+19*N); } + + __forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+20*N); } + __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+20*N); } + + __forceinline short* bounds_vz_lower(size_t N) { return (short*)((char*)this+6+21*N); } + __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((char*)this+6+21*N); } + + __forceinline short* bounds_vz_upper(size_t N) { return (short*)((char*)this+6+23*N); } + __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((char*)this+6+23*N); } + + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+25*N); } + __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+25*N); } + + __forceinline float* scale(size_t N) { return (float*)((char*)this+6+25*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+25*N+12); } + + __forceinline char* end(size_t N) { return (char*)this+6+25*N+16; } + __forceinline const char* end(size_t N) const { return (char*)this+6+25*N+16; } + }; + + template<int M> + typename CurveNi<M>::Type CurveNi<M>::type; + + typedef CurveNi<4> Curve4i; + typedef CurveNi<8> Curve8i; +} diff --git a/thirdparty/embree/kernels/geometry/curveNi_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_intersector.h new file mode 100644 index 0000000000..c0b66515c1 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi_intersector.h @@ -0,0 +1,569 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct CurveNiIntersector1 + { + typedef CurveNi<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa org1 = (ray.org-offset)*scale; + const Vec3fa dir1 = ray.dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear())); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar)); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + }; + + template<int M, int K> + struct CurveNiIntersectorK + { + typedef CurveNi<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + const Vec3fa org1 = (ray_org-offset)*scale; + const Vec3fa dir1 = ray_dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k])); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k])); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb.h b/thirdparty/embree/kernels/geometry/curveNi_mb.h new file mode 100644 index 0000000000..5d972b43a0 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi_mb.h @@ -0,0 +1,278 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + template<int M> + struct CurveNiMB + { + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue"); + return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNiMB () {} + + /*! fill curve from curve list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range) + { + size_t end = min(begin+M,_end); + N = (unsigned char)(end-begin); + const unsigned int geomID0 = prims[begin].geomID(); + this->geomID(N) = geomID0; + ty = (unsigned char) scene->get(geomID0)->getType(); + + /* encode all primitives */ + LBBox3fa lbounds = empty; + for (size_t i=0; i<N; i++) + { + const PrimRefMB& prim = prims[begin+i]; + const unsigned int geomID = prim.geomID(); assert(geomID == geomID0); + const unsigned int primID = prim.primID(); + lbounds.extend(scene->get(geomID)->vlinearBounds(primID,time_range)); + } + BBox3fa bounds = lbounds.bounds(); + + /* calculate offset and scale */ + Vec3fa loffset = bounds.lower; + float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); + if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; + *this->offset(N) = loffset; + *this->scale(N) = lscale; + this->time_offset(N) = time_range.lower; + this->time_scale(N) = 1.0f/time_range.size(); + + /* encode all primitives */ + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRefMB& prim = prims[begin]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpaceMB(primID,time_range); + + const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); + const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); + + // NOTE: this weird (char) (short) cast works around VS2015 Win32 compiler bug + bounds_vx_x(N)[i] = (char) (short) space3.vx.x; + bounds_vx_y(N)[i] = (char) (short) space3.vx.y; + bounds_vx_z(N)[i] = (char) (short) space3.vx.z; + bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); + bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f); + bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f); + bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f); + + bounds_vy_x(N)[i] = (char) (short) space3.vy.x; + bounds_vy_y(N)[i] = (char) (short) space3.vy.y; + bounds_vy_z(N)[i] = (char) (short) space3.vy.z; + bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f); + bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f); + bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f); + bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f); + + bounds_vz_x(N)[i] = (char) (short) space3.vz.x; + bounds_vz_y(N)[i] = (char) (short) space3.vz.y; + bounds_vz_z(N)[i] = (char) (short) space3.vz.z; + bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f); + bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f); + bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f); + bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f); + + this->primID(N)[i] = primID; + } + + return lbounds; + } + + template<typename BVH, typename SetMB, typename Allocator> + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.begin(); + size_t end = prims.end(); + size_t items = CurveNiMB::blocks(prims.size()); + size_t numbytes = CurveNiMB::bytes(prims.size()); + CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range)); + + return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); + }; + + + public: + + // 27.6 - 46 bytes per primitive + unsigned char ty; + unsigned char N; + unsigned char data[4+37*M+24]; + + /* + struct Layout + { + unsigned int geomID; + unsigned int primID[N]; + + char bounds_vx_x[N]; + char bounds_vx_y[N]; + char bounds_vx_z[N]; + short bounds_vx_lower0[N]; + short bounds_vx_upper0[N]; + short bounds_vx_lower1[N]; + short bounds_vx_upper1[N]; + + char bounds_vy_x[N]; + char bounds_vy_y[N]; + char bounds_vy_z[N]; + short bounds_vy_lower0[N]; + short bounds_vy_upper0[N]; + short bounds_vy_lower1[N]; + short bounds_vy_upper1[N]; + + char bounds_vz_x[N]; + char bounds_vz_y[N]; + char bounds_vz_z[N]; + short bounds_vz_lower0[N]; + short bounds_vz_upper0[N]; + short bounds_vz_lower1[N]; + short bounds_vz_upper1[N]; + + Vec3f offset; + float scale; + + float time_offset; + float time_scale; + }; + */ + + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((char*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((char*)this+2); } + + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((char*)this+6); } + __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((char*)this+6); } + + __forceinline char* bounds_vx_x(size_t N) { return (char*)((char*)this+6+4*N); } + __forceinline const char* bounds_vx_x(size_t N) const { return (char*)((char*)this+6+4*N); } + + __forceinline char* bounds_vx_y(size_t N) { return (char*)((char*)this+6+5*N); } + __forceinline const char* bounds_vx_y(size_t N) const { return (char*)((char*)this+6+5*N); } + + __forceinline char* bounds_vx_z(size_t N) { return (char*)((char*)this+6+6*N); } + __forceinline const char* bounds_vx_z(size_t N) const { return (char*)((char*)this+6+6*N); } + + __forceinline short* bounds_vx_lower0(size_t N) { return (short*)((char*)this+6+7*N); } + __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((char*)this+6+7*N); } + + __forceinline short* bounds_vx_upper0(size_t N) { return (short*)((char*)this+6+9*N); } + __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((char*)this+6+9*N); } + + __forceinline short* bounds_vx_lower1(size_t N) { return (short*)((char*)this+6+11*N); } + __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((char*)this+6+11*N); } + + __forceinline short* bounds_vx_upper1(size_t N) { return (short*)((char*)this+6+13*N); } + __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((char*)this+6+13*N); } + + __forceinline char* bounds_vy_x(size_t N) { return (char*)((char*)this+6+15*N); } + __forceinline const char* bounds_vy_x(size_t N) const { return (char*)((char*)this+6+15*N); } + + __forceinline char* bounds_vy_y(size_t N) { return (char*)((char*)this+6+16*N); } + __forceinline const char* bounds_vy_y(size_t N) const { return (char*)((char*)this+6+16*N); } + + __forceinline char* bounds_vy_z(size_t N) { return (char*)((char*)this+6+17*N); } + __forceinline const char* bounds_vy_z(size_t N) const { return (char*)((char*)this+6+17*N); } + + __forceinline short* bounds_vy_lower0(size_t N) { return (short*)((char*)this+6+18*N); } + __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((char*)this+6+18*N); } + + __forceinline short* bounds_vy_upper0(size_t N) { return (short*)((char*)this+6+20*N); } + __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((char*)this+6+20*N); } + + __forceinline short* bounds_vy_lower1(size_t N) { return (short*)((char*)this+6+22*N); } + __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((char*)this+6+22*N); } + + __forceinline short* bounds_vy_upper1(size_t N) { return (short*)((char*)this+6+24*N); } + __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((char*)this+6+24*N); } + + __forceinline char* bounds_vz_x(size_t N) { return (char*)((char*)this+6+26*N); } + __forceinline const char* bounds_vz_x(size_t N) const { return (char*)((char*)this+6+26*N); } + + __forceinline char* bounds_vz_y(size_t N) { return (char*)((char*)this+6+27*N); } + __forceinline const char* bounds_vz_y(size_t N) const { return (char*)((char*)this+6+27*N); } + + __forceinline char* bounds_vz_z(size_t N) { return (char*)((char*)this+6+28*N); } + __forceinline const char* bounds_vz_z(size_t N) const { return (char*)((char*)this+6+28*N); } + + __forceinline short* bounds_vz_lower0(size_t N) { return (short*)((char*)this+6+29*N); } + __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((char*)this+6+29*N); } + + __forceinline short* bounds_vz_upper0(size_t N) { return (short*)((char*)this+6+31*N); } + __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((char*)this+6+31*N); } + + __forceinline short* bounds_vz_lower1(size_t N) { return (short*)((char*)this+6+33*N); } + __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((char*)this+6+33*N); } + + __forceinline short* bounds_vz_upper1(size_t N) { return (short*)((char*)this+6+35*N); } + __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((char*)this+6+35*N); } + + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((char*)this+6+37*N); } + __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((char*)this+6+37*N); } + + __forceinline float* scale(size_t N) { return (float*)((char*)this+6+37*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((char*)this+6+37*N+12); } + + __forceinline float& time_offset(size_t N) { return *(float*)((char*)this+6+37*N+16); } + __forceinline const float& time_offset(size_t N) const { return *(float*)((char*)this+6+37*N+16); } + + __forceinline float& time_scale(size_t N) { return *(float*)((char*)this+6+37*N+20); } + __forceinline const float& time_scale(size_t N) const { return *(float*)((char*)this+6+37*N+20); } + + __forceinline char* end(size_t N) { return (char*)this+6+37*N+24; } + __forceinline const char* end(size_t N) const { return (char*)this+6+37*N+24; } + }; + + template<int M> + typename CurveNiMB<M>::Type CurveNiMB<M>::type; + + typedef CurveNiMB<4> Curve4iMB; + typedef CurveNiMB<8> Curve8iMB; +} diff --git a/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h new file mode 100644 index 0000000000..bab796b33b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNi_mb_intersector.h @@ -0,0 +1,516 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi_mb.h" +#include "../subdiv/linear_bezier_patch.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct CurveNiMBIntersector1 + { + typedef CurveNiMB<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa org1 = (ray.org-offset)*scale; + const Vec3fa dir1 = ray.dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N); + const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N)); + const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N)); + const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); + const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N)); + const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N)); + const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); + + const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N)); + const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N)); + const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); + const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N)); + const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N)); + const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); + + const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N)); + const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N)); + const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); + const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N)); + const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N)); + const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); + + const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear())); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar)); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + + if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time()); + if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + }; + + template<int M, int K> + struct CurveNiMBIntersectorK + { + typedef CurveNiMB<M> Primitive; + typedef Vec3vf<M> Vec3vfM; + typedef LinearSpace3<Vec3vfM>LinearSpace3vfM; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + const Vec3fa org1 = (ray_org-offset)*scale; + const Vec3fa dir1 = ray_dir*scale; + + const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), + vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)), + vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat<M> ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N); + const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N)); + const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N)); + const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); + const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N)); + const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N)); + const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); + + const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N)); + const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N)); + const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); + const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N)); + const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N)); + const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); + + const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N)); + const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N)); + const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); + const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N)); + const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N)); + const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); + + const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); + const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y); + const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z); + + const vfloat<M> round_up (1.0f+3.0f*float(ulp)); + const vfloat<M> round_down(1.0f-3.0f*float(ulp)); + const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k])); + const vfloat<M> tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k])); + tNear_o = tNear; + return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + + if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,ray.time()[k]); + if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curveNv.h b/thirdparty/embree/kernels/geometry/curveNv.h new file mode 100644 index 0000000000..e41a381706 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNv.h @@ -0,0 +1,101 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi.h" + +namespace embree +{ + template<int M> + struct CurveNv : public CurveNi<M> + { + using CurveNi<M>::N; + + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue"); + return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNv () {} + + /*! fill curve from curve list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) + { + size_t end = min(begin+M,_end); + size_t N = end-begin; + + /* encode all primitives */ + for (size_t i=0; i<N; i++) + { + const PrimRef& prim = prims[begin+i]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + CurveGeometry* mesh = (CurveGeometry*) scene->get(geomID); + const unsigned vtxID = mesh->curve(primID); + Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0)); + Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1)); + Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2)); + Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3)); + } + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) + { + if (set.size() == 0) + return BVH::emptyNode; + + /* fall back to CurveNi for oriented curves */ + unsigned int geomID = prims[set.begin()].geomID(); + if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) { + return CurveNi<M>::createLeaf(bvh,prims,set,alloc); + } + if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) { + return CurveNi<M>::createLeaf(bvh,prims,set,alloc); + } + + size_t start = set.begin(); + size_t items = CurveNv::blocks(set.size()); + size_t numbytes = CurveNv::bytes(set.size()); + CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment); + for (size_t i=0; i<items; i++) { + accel[i].CurveNv<M>::fill(prims,start,set.end(),bvh->scene); + accel[i].CurveNi<M>::fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + public: + unsigned char data[4*16*M]; + __forceinline Vec3fa* vertices(size_t i, size_t N) { return (Vec3fa*)CurveNi<M>::end(N)+4*i; } + __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi<M>::end(N)+4*i; } + }; + + template<int M> + typename CurveNv<M>::Type CurveNv<M>::type; + + typedef CurveNv<4> Curve4v; + typedef CurveNv<8> Curve8v; +} diff --git a/thirdparty/embree/kernels/geometry/curveNv_intersector.h b/thirdparty/embree/kernels/geometry/curveNv_intersector.h new file mode 100644 index 0000000000..2742725aec --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curveNv_intersector.h @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNv.h" +#include "curveNi_intersector.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct CurveNvIntersector1 : public CurveNiIntersector1<M> + { + typedef CurveNv<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar)); + } + return false; + } + }; + + template<int M, int K> + struct CurveNvIntersectorK : public CurveNiIntersectorK<M,K> + { + typedef CurveNv<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Intersector, typename Epilog> + static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + } + + template<typename Intersector, typename Epilog> + static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat<M> tNear; + vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector.h b/thirdparty/embree/kernels/geometry/curve_intersector.h new file mode 100644 index 0000000000..1e8ac26125 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector.h @@ -0,0 +1,98 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../subdiv/bezier_curve.h" +#include "../common/primref.h" +#include "bezier_hair_intersector.h" +#include "bezier_ribbon_intersector.h" +#include "bezier_curve_intersector.h" +#include "oriented_curve_intersector.h" +#include "../bvh/node_intersector1.h" + +// FIXME: this file seems replicate of curve_intersector_virtual.h + +namespace embree +{ + namespace isa + { + struct VirtualCurveIntersector1 + { + typedef unsigned char Primitive; + typedef CurvePrecalculations1 Precalculations; + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<1>(&pre,&ray,context,prim); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<1>(&pre,&ray,context,prim); + } + }; + + template<int K> + struct VirtualCurveIntersectorK + { + typedef unsigned char Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + size_t mask = movemask(valid_i); + while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim); + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + vbool<K> valid_o = false; + size_t mask = movemask(valid_i); + while (mask) { + size_t k = bscf(mask); + if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim)) + set(valid_o, k); + } + return valid_o; + } + + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<K>(&pre,&ray,k,context,prim); + } + + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<K>(&pre,&ray,k,context,prim); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h new file mode 100644 index 0000000000..748a9511a5 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_distance.h @@ -0,0 +1,129 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<typename NativeCurve3fa, int M> + struct DistanceCurveHit + { + __forceinline DistanceCurveHit() {} + + __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N, + const NativeCurve3fa& curve3D) + : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} + + __forceinline void finalize() + { + vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N)); + vv = V; + vt = T; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { + return curve3D.eval_du(vu[i]); + } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + int i, N; + NativeCurve3fa curve3D; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + }; + + template<typename NativeCurve3fa> + struct DistanceCurve1Intersector1 + { + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + + /* transform control points into ray space */ + const NativeCurve3fa curve3Di(v0,v1,v2,v3); + const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di); + const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org); + + /* evaluate the bezier curve */ + vboolx valid = vfloatx(step) < vfloatx(float(N)); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N); + + /* approximative intersection with cone */ + const Vec4vfx v = p1-p0; + const Vec4vfx w = -p0; + const vfloatx d0 = madd(w.x,v.x,w.y*v.y); + const vfloatx d1 = madd(v.x,v.x,v.y*v.y); + const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); + const Vec4vfx p = madd(u,v,p0); + const vfloatx t = p.z*pre.depth_scale; + const vfloatx d2 = madd(p.x,p.x,p.y*p.y); + const vfloatx r = p.w; + const vfloatx r2 = r*r; + valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections + + /* update hit information */ + bool ishit = false; + if (unlikely(any(valid))) { + DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D); + ishit = ishit | epilog(valid,hit); + } + + if (unlikely(VSIZEX < N)) + { + /* process SIMD-size many segments per iteration */ + for (int i=VSIZEX; i<N; i+=VSIZEX) + { + /* evaluate the bezier curve */ + vboolx valid = vintx(i)+vintx(step) < vintx(N); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N); + + /* approximative intersection with cone */ + const Vec4vfx v = p1-p0; + const Vec4vfx w = -p0; + const vfloatx d0 = madd(w.x,v.x,w.y*v.y); + const vfloatx d1 = madd(v.x,v.x,v.y*v.y); + const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); + const Vec4vfx p = madd(u,v,p0); + const vfloatx t = p.z*pre.depth_scale; + const vfloatx d2 = madd(p.x,p.x,p.y*p.y); + const vfloatx r = p.w; + const vfloatx r2 = r*r; + valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections + + /* update hit information */ + if (unlikely(any(valid))) { + DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D); + ishit = ishit | epilog(valid,hit); + } + } + } + return ishit; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h new file mode 100644 index 0000000000..3d8900c2aa --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h @@ -0,0 +1,417 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" +#include "curve_intersector_sweep.h" +#include "../subdiv/linear_bezier_patch.h" + +#define DBG(x) + +namespace embree +{ + namespace isa + { + template<typename Ray, typename Epilog> + struct TensorLinearCubicBezierSurfaceIntersector + { + const LinearSpace3fa& ray_space; + Ray& ray; + TensorLinearCubicBezierSurface3fa curve3d; + TensorLinearCubicBezierSurface2fa curve2d; + float eps; + const Epilog& epilog; + bool isHit; + + __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog) + : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false) + { + const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org); + curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R)); + const BBox2fa b2 = curve2d.bounds(); + eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper))); + } + + __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1) + { + if (p1 == p0) { + if (p0 == 0.0f) return Interval1f(u0,u1); + else return Interval1f(empty); + } + const float t = -p0/(p1-p0); + const float tt = lerp(u0,u1,t); + return Interval1f(tt); + } + + __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u) + { + if (sign(p0.lower) != sign(p0.upper)) u.extend(u0); + if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower)); + if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper)); + if (sign(p1.lower) != sign(p1.upper)) u.extend(u1); + } + + __forceinline Interval1f bezier_clipping(const CubicBezierCurve<Interval1f>& curve) + { + Interval1f u = empty; + solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u); + solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u); + solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u); + solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u); + solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u); + solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u); + return intersect(u,Interval1f(0.0f,1.0f)); + } + + __forceinline Interval1f bezier_clipping(const LinearBezierCurve<Interval1f>& curve) + { + Interval1f v = empty; + solve_linear(0.0f,1.0f,curve.v0,curve.v1,v); + return intersect(v,Interval1f(0.0f,1.0f)); + } + + __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2) + { + BBox2fa bounds = curve2.bounds(); + if (bounds.upper.x < 0.0f) return; + if (bounds.upper.y < 0.0f) return; + if (bounds.lower.x > 0.0f) return; + if (bounds.lower.y > 0.0f) return; + + if (max(cu.size(),cv.size()) < 1E-4f) + { + const float u = cu.center(); + const float v = cv.center(); + TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); + const float t = curve_z.eval(u,v); + if (ray.tnear() <= t && t <= ray.tfar) { + const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); + BezierCurveHit hit(t,u,v,Ng); + isHit |= epilog(hit); + } + return; + } + + const Vec2fa dv = curve2.axis_v(); + const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); + LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u(); + if (!curve0v.hasRoot()) return; + + const Interval1f v = bezier_clipping(curve0v); + if (isEmpty(v)) return; + TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + + const Vec2fa du = curve2.axis_u(); + const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du); + CubicBezierCurve<Interval1f> curve0u = curve1u.reduce_v(); + int roots = curve0u.maxRoots(); + if (roots == 0) return; + + if (roots == 1) + { + const Interval1f u = bezier_clipping(curve0u); + if (isEmpty(u)) return; + TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u); + cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper)); + solve_bezier_clipping(cu,cv,curve2b); + return; + } + + TensorLinearCubicBezierSurface2fa curve2l, curve2r; + curve2a.split_u(curve2l,curve2r); + solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l); + solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r); + } + + __forceinline bool solve_bezier_clipping() + { + solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d); + return isHit; + } + + __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv) + { + Vec2fa uv(cu.center(),cv.center()); + const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y); + const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y); + const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); + solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J); + } + + __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J) + { + Vec2fa uv = uv_in; + + for (size_t i=0; i<200; i++) + { + const Vec2fa f = curve2d.eval(uv.x,uv.y); + const Vec2fa duv = rcp_J*f; + uv -= duv; + + if (max(abs(f.x),abs(f.y)) < eps) + { + const float u = uv.x; + const float v = uv.y; + if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs + if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs + const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); + const float t = curve_z.eval(u,v); + if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs + const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); + BezierCurveHit hit(t,u,v,Ng); + isHit |= epilog(hit); + return; + } + } + } + + __forceinline bool clip_v(BBox1f& cu, BBox1f& cv) + { + const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower); + const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv); + LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u(); + if (!curve0v.hasRoot()) return false; + Interval1f v = bezier_clipping(curve0v); + if (isEmpty(v)) return false; + v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + return true; + } + + __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv) + { + /* perform bezier clipping in v-direction to get tight v-bounds */ + TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv); + const Vec2fa dv = curve2.axis_v(); + const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); + LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u(); + if (unlikely(!curve0v.hasRoot())) return true; + Interval1f v = bezier_clipping(curve0v); + if (unlikely(isEmpty(v))) return true; + v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); + curve2 = curve2.clip_v(v); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + + /* perform one newton raphson iteration */ + Vec2fa c(cu.center(),cv.center()); + Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv); + const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); + const Vec2fa c1 = c - rcp_J*f; + + /* calculate bounds of derivatives */ + const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds(); + const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds(); + + /* calculate krawczyk test */ + LinearSpace2<Vec2<Interval1f>> I(Interval1f(1.0f), Interval1f(0.0f), + Interval1f(0.0f), Interval1f(1.0f)); + + LinearSpace2<Vec2<Interval1f>> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x), + Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y)); + + const LinearSpace2<Vec2f> rcp_J2(rcp_J); + const LinearSpace2<Vec2<Interval1f>> rcp_Ji(rcp_J2); + + const Vec2<Interval1f> x(cu,cv); + const Vec2<Interval1f> K = Vec2<Interval1f>(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2<Interval1f>(Vec2f(c))); + + /* test if there is no solution */ + const Vec2<Interval1f> KK = intersect(K,x); + if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true; + + /* exit if convergence cannot get proven, but terminate if we are very small */ + if (unlikely(!subset(K,x) && !very_small)) return false; + + /* solve using newton raphson iteration of convergence is guarenteed */ + solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); + return true; + } + + __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv) + { + if (!clip_v(cu,cv)) return; + return solve_newton_raphson(cu,cv); + } + + __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv) + { + unsigned int sptr = 0; + const unsigned int stack_size = 4; + unsigned int mask_stack[stack_size]; + BBox1f cu_stack[stack_size]; + BBox1f cv_stack[stack_size]; + goto entry; + + /* terminate if stack is empty */ + while (sptr) + { + /* pop from stack */ + { + sptr--; + size_t mask = mask_stack[sptr]; + cu = cu_stack[sptr]; + cv = cv_stack[sptr]; + const size_t i = bscf(mask); + mask_stack[sptr] = mask; + if (mask) sptr++; // there are still items on the stack + + /* process next element recurse into each hit curve segment */ + const float u0 = float(i+0)*(1.0f/(VSIZEX-1)); + const float u1 = float(i+1)*(1.0f/(VSIZEX-1)); + const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1)); + cu = cui; + } + +#if 0 + solve_newton_raphson_no_recursion(cu,cv); + continue; + +#else + /* we assume convergence for small u ranges and verify using krawczyk */ + if (cu.size() < 1.0f/6.0f) { + const bool very_small = cu.size() < 0.001f || sptr >= stack_size; + if (solve_krawczyk(very_small,cu,cv)) { + continue; + } + } +#endif + + entry: + + /* split the curve into VSIZEX-1 segments in u-direction */ + vboolx valid = true; + TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu); + + /* slabs test in u-direction */ + Vec2vfx ndv = cross(subcurves.axis_v()); + BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds(); + valid &= boundsv.lower <= eps; + valid &= boundsv.upper >= -eps; + if (none(valid)) continue; + + /* slabs test in v-direction */ + Vec2vfx ndu = cross(subcurves.axis_u()); + BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds(); + valid &= boundsu.lower <= eps; + valid &= boundsu.upper >= -eps; + if (none(valid)) continue; + + /* push valid segments to stack */ + assert(sptr < stack_size); + mask_stack [sptr] = movemask(valid); + cu_stack [sptr] = cu; + cv_stack [sptr] = cv; + sptr++; + } + } + + __forceinline bool solve_newton_raphson_main() + { + BBox1f vu(0.0f,1.0f); + BBox1f vv(0.0f,1.0f); + solve_newton_raphson_recursion(vu,vv); + return isHit; + } + }; + + + template<template<typename Ty> class SourceCurve> + struct OrientedCurve1Intersector1 + { + //template<typename Ty> using Curve = SourceCurve<Ty>; + typedef SourceCurve<Vec3ff> SourceCurve3ff; + typedef SourceCurve<Vec3fa> SourceCurve3fa; + + __forceinline OrientedCurve1Intersector1() {} + + __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, + const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, + const Epilog& epilog) const + { + STAT3(normal.trav_prims,1,1,1); + + SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); + SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); + ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); + TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); + } + + template<typename Epilog> + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const + { + STAT3(normal.trav_prims,1,1,1); + //return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); + } + }; + + template<template<typename Ty> class SourceCurve, int K> + struct OrientedCurve1IntersectorK + { + //template<typename Ty> using Curve = SourceCurve<Ty>; + typedef SourceCurve<Vec3ff> SourceCurve3ff; + typedef SourceCurve<Vec3fa> SourceCurve3fa; + + struct Ray1 + { + __forceinline Ray1(RayK<K>& ray, size_t k) + : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} + + Vec3fa org; + Vec3fa dir; + float _tnear; + float& tfar; + + __forceinline float& tnear() { return _tnear; } + //__forceinline float& tfar() { return _tfar; } + __forceinline const float& tnear() const { return _tnear; } + //__forceinline const float& tfar() const { return _tfar; } + }; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, + const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); + SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); + ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); + TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); + } + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const TensorLinearCubicBezierSurface3fa& curve, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + //return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h new file mode 100644 index 0000000000..de6b70be1b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_precalculations.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/geometry.h" + +namespace embree +{ + namespace isa + { + struct CurvePrecalculations1 + { + float depth_scale; + LinearSpace3fa ray_space; + + __forceinline CurvePrecalculations1() {} + + __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr) + { + depth_scale = rsqrt(dot(ray.dir,ray.dir)); + LinearSpace3fa space = frame(depth_scale*ray.dir); + space.vz *= depth_scale; + ray_space = space.transposed(); + } + }; + + template<int K> + struct CurvePrecalculationsK + { + vfloat<K> depth_scale; + LinearSpace3fa ray_space[K]; + + __forceinline CurvePrecalculationsK(const vbool<K>& valid, const RayK<K>& ray) + { + size_t mask = movemask(valid); + depth_scale = rsqrt(dot(ray.dir,ray.dir)); + while (mask) { + size_t k = bscf(mask); + Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k); + ray_space_k.vz *= depth_scale[k]; + ray_space[k] = ray_space_k.transposed(); + } + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h new file mode 100644 index 0000000000..c3272e99fd --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_ribbon.h @@ -0,0 +1,216 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "quad_intersector.h" +#include "curve_intersector_precalculations.h" + +#define Bezier1Intersector1 RibbonCurve1Intersector1 +#define Bezier1IntersectorK RibbonCurve1IntersectorK + +namespace embree +{ + namespace isa + { + template<typename NativeCurve3ff, int M> + struct RibbonHit + { + __forceinline RibbonHit() {} + + __forceinline RibbonHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N, + const NativeCurve3ff& curve3D) + : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} + + __forceinline void finalize() + { + vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N)); + vv = V; + vt = T; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return curve3D.eval_du(vu[i]); } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return (Vec3vf<M>) curve3D.template veval_du<M>(vu); } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + int i, N; + NativeCurve3ff curve3D; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + }; + + /* calculate squared distance of point p0 to line p1->p2 */ + __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2) + { + const vfloatx num = det(p2-p1,p1-p0); + const vfloatx den2 = dot(p2-p1,p2-p1); + return std::make_pair(num*num,den2); + } + + /* performs culling against a cylinder */ + __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r) + { + const std::pair<vfloatx,vfloatx> d = sqr_point_line_distance(p0,p1,p2); + return d.first <= r*r*d.second; + } + + template<typename NativeCurve3ff, typename Epilog> + __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar, + const LinearSpace3fa& ray_space, const float& depth_scale, + const NativeCurve3ff& curve3D, const int N, + const Epilog& epilog) + { + /* transform control points into ray space */ + const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org); + float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3))); + + /* evaluate the bezier curve */ + bool ishit = false; + vboolx valid = vfloatx(step) < vfloatx(float(N)); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N); + valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); + + if (any(valid)) + { + Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N); + Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N); + dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); + dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); + const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); + const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); + const Vec3vfx nn0 = normalize(n0); + const Vec3vfx nn1 = normalize(n1); + const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); + const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); + + vfloatx vu,vv,vt; + vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); + + if (any(valid0)) + { + /* ignore self intersections */ + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { + vfloatx r = lerp(p0.w, p1.w, vu); + valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; + } + + if (any(valid0)) + { + vv = madd(2.0f,vv,vfloatx(-1.0f)); + RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D); + ishit |= epilog(bhit.valid,bhit); + } + } + } + + if (unlikely(VSIZEX < N)) + { + /* process SIMD-size many segments per iteration */ + for (int i=VSIZEX; i<N; i+=VSIZEX) + { + /* evaluate the bezier curve */ + vboolx valid = vintx(i)+vintx(step) < vintx(N); + const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N); + const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N); + valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); + if (none(valid)) continue; + + Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N); + Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N); + dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); + dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); + const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); + const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); + const Vec3vfx nn0 = normalize(n0); + const Vec3vfx nn1 = normalize(n1); + const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); + const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); + + vfloatx vu,vv,vt; + vboolx valid0 = intersect_quad_backface_culling<VSIZEX>(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); + + if (any(valid0)) + { + /* ignore self intersections */ + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { + vfloatx r = lerp(p0.w, p1.w, vu); + valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; + } + + if (any(valid0)) + { + vv = madd(2.0f,vv,vfloatx(-1.0f)); + RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D); + ishit |= epilog(bhit.valid,bhit); + } + } + } + } + return ishit; + } + + template<template<typename Ty> class NativeCurve> + struct RibbonCurve1Intersector1 + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + NativeCurve3ff curve(v0,v1,v2,v3); + curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve); + return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar, + pre.ray_space,pre.depth_scale, + curve,N, + epilog); + } + }; + + template<template<typename Ty> class NativeCurve, int K> + struct RibbonCurve1IntersectorK + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + NativeCurve3ff curve(v0,v1,v2,v3); + curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve); + return intersect_ribbon<NativeCurve3ff>(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k], + pre.ray_space[k],pre.depth_scale[k], + curve,N, + epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h new file mode 100644 index 0000000000..2d4abd73ac --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h @@ -0,0 +1,364 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "cylinder.h" +#include "plane.h" +#include "line_intersector.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + static const size_t numJacobianIterations = 5; +#if defined(__AVX__) + static const size_t numBezierSubdivisions = 2; +#else + static const size_t numBezierSubdivisions = 3; +#endif + + struct BezierCurveHit + { + __forceinline BezierCurveHit() {} + + __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng) + : t(t), u(u), v(0.0f), Ng(Ng) {} + + __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng) + : t(t), u(u), v(v), Ng(Ng) {} + + __forceinline void finalize() {} + + public: + float t; + float u; + float v; + Vec3fa Ng; + }; + + template<typename NativeCurve3ff, typename Ray, typename Epilog> + __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i, + const vfloatx& u, const BBox<vfloatx>& tp, const BBox<vfloatx>& h0, const BBox<vfloatx>& h1, + const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du, + const Epilog& epilog) + { + if (tp.lower[i]+dt > ray.tfar) return false; + Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]); + if (h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]); + if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]); + BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o); + return epilog(hit); + } + + template<typename NativeCurve3ff, typename Ray, typename Epilog> + __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog) + { + const Vec3fa org = zero; + const Vec3fa dir = ray.dir; + const float length_ray_dir = length(dir); + + /* error of curve evaluations is propertional to largest coordinate */ + const BBox3ff box = curve.bounds(); + const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper))); + + for (size_t i=0; i<numJacobianIterations; i++) + { + const Vec3fa Q = madd(Vec3fa(t),dir,org); + //const Vec3fa dQdu = zero; + const Vec3fa dQdt = dir; + const float Q_err = 16.0f*float(ulp)*length_ray_dir*t; // works as org=zero here + + Vec3ff P,dPdu,ddPdu; curve.eval(u,P,dPdu,ddPdu); + //const Vec3fa dPdt = zero; + + const Vec3fa R = Q-P; + const float len_R = length(R); //reduce_max(abs(R)); + const float R_err = max(Q_err,P_err); + const Vec3fa dRdu = /*dQdu*/-dPdu; + const Vec3fa dRdt = dQdt;//-dPdt; + + const Vec3fa T = normalize(dPdu); + const Vec3fa dTdu = dnormalize(dPdu,ddPdu); + //const Vec3fa dTdt = zero; + const float cos_err = P_err/length(dPdu); + + /* Error estimate for dot(R,T): + + dot(R,T) = cos(R,T) |R| |T| + = (cos(R,T) +- cos_error) * (|R| +- |R|_err) * (|T| +- |T|_err) + = cos(R,T)*|R|*|T| + +- cos(R,T)*(|R|*|T|_err + |T|*|R|_err) + +- cos_error*(|R| + |T|) + +- lower order terms + with cos(R,T) being in [0,1] and |T| = 1 we get: + dot(R,T)_err = |R|*|T|_err + |R|_err = cos_error*(|R|+1) + */ + + const float f = dot(R,T); + const float f_err = len_R*P_err + R_err + cos_err*(1.0f+len_R); + const float dfdu = dot(dRdu,T) + dot(R,dTdu); + const float dfdt = dot(dRdt,T);// + dot(R,dTdt); + + const float K = dot(R,R)-sqr(f); + const float dKdu = /*2.0f*/(dot(R,dRdu)-f*dfdu); + const float dKdt = /*2.0f*/(dot(R,dRdt)-f*dfdt); + const float rsqrt_K = rsqrt(K); + + const float g = sqrt(K)-P.w; + const float g_err = R_err + f_err + 16.0f*float(ulp)*box.upper.w; + const float dgdu = /*0.5f*/dKdu*rsqrt_K-dPdu.w; + const float dgdt = /*0.5f*/dKdt*rsqrt_K;//-dPdt.w; + + const LinearSpace2f J = LinearSpace2f(dfdu,dfdt,dgdu,dgdt); + const Vec2f dut = rcp(J)*Vec2f(f,g); + const Vec2f ut = Vec2f(u,t) - dut; + u = ut.x; t = ut.y; + + if (abs(f) < f_err && abs(g) < g_err) + { + t+=dt; + if (!(ray.tnear() <= t && t <= ray.tfar)) return false; // rejects NaNs + if (!(u >= 0.0f && u <= 1.0f)) return false; // rejects NaNs + const Vec3fa R = normalize(Q-P); + const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu); + const Vec3fa V = cross(dPdu,R); + BezierCurveHit hit(t,u,cross(V,U)); + return epilog(hit); + } + } + return false; + } + + template<typename NativeCurve3ff, typename Ray, typename Epilog> + bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, + float u0, float u1, unsigned int depth, const Epilog& epilog) + { +#if defined(__AVX__) + enum { VSIZEX_ = 8 }; + typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues + typedef vint8 vintx; + typedef vfloat8 vfloatx; +#else + enum { VSIZEX_ = 4 }; + typedef vbool4 vboolx; + typedef vint4 vintx; + typedef vfloat4 vfloatx; +#endif + typedef Vec3<vfloatx> Vec3vfx; + typedef Vec4<vfloatx> Vec4vfx; + + unsigned int maxDepth = numBezierSubdivisions; + bool found = false; + const Vec3fa org = zero; + const Vec3fa dir = ray.dir; + + unsigned int sptr = 0; + const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below + struct StackEntry { + vboolx valid; + vfloatx tlower; + float u0; + float u1; + unsigned int depth; + }; + StackEntry stack[stack_size]; + goto entry; + + /* terminate if stack is empty */ + while (sptr) + { + /* pop from stack */ + { + sptr--; + vboolx valid = stack[sptr].valid; + const vfloatx tlower = stack[sptr].tlower; + valid &= tlower+dt <= ray.tfar; + if (none(valid)) continue; + u0 = stack[sptr].u0; + u1 = stack[sptr].u1; + depth = stack[sptr].depth; + const size_t i = select_min(valid,tlower); clear(valid,i); + stack[sptr].valid = valid; + if (any(valid)) sptr++; // there are still items on the stack + + /* process next segment */ + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); + u0 = vu0[i+0]; + u1 = vu0[i+1]; + } + entry: + + /* subdivide curve */ + const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); + Vec4vfx P0, dP0du; curve.template veval<VSIZEX_>(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale); + const Vec4vfx P3 = shift_right_1(P0); + const Vec4vfx dP3du = shift_right_1(dP0du); + const Vec4vfx P1 = P0 + dP0du; + const Vec4vfx P2 = P3 - dP3du; + + /* calculate bounding cylinders */ + const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0)); + const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0)); + const vfloatx maxr12 = sqrt(max(rr1,rr2)); + const vfloatx one_plus_ulp = 1.0f+2.0f*float(ulp); + const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp); + vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12; + vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12; + r_outer = one_plus_ulp*r_outer; + r_inner = max(0.0f,one_minus_ulp*r_inner); + const CylinderN<vfloatx::size> cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer); + const CylinderN<vfloatx::size> cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner); + vboolx valid = true; clear(valid,vfloatx::size-1); + + /* intersect with outer cylinder */ + BBox<vfloatx> tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1; + valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1); + if (none(valid)) continue; + + /* intersect with cap-planes */ + BBox<vfloatx> tp(ray.tnear()-dt,ray.tfar-dt); + tp = embree::intersect(tp,tc_outer); + BBox<vfloatx> h0 = HalfPlaneN<vfloatx::size>(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir); + tp = embree::intersect(tp,h0); + BBox<vfloatx> h1 = HalfPlaneN<vfloatx::size>(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir); + tp = embree::intersect(tp,h1); + valid &= tp.lower <= tp.upper; + if (none(valid)) continue; + + /* clamp and correct u parameter */ + u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f)); + u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f)); + u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size))); + u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size))); + + /* intersect with inner cylinder */ + BBox<vfloatx> tc_inner; + vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero; + const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1); + + /* at the unstable area we subdivide deeper */ + const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f); + const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f); + + /* subtract the inner interval from the current hit interval */ + BBox<vfloatx> tp0, tp1; + subtract(tp,tc_inner,tp0,tp1); + vboolx valid0 = valid & (tp0.lower <= tp0.upper); + vboolx valid1 = valid & (tp1.lower <= tp1.upper); + if (none(valid0 | valid1)) continue; + + /* iterate over all first hits front to back */ + const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth)); + vboolx recursion_valid0 = valid0 & (depth < termDepth0); + valid0 &= depth >= termDepth0; + + while (any(valid0)) + { + const size_t i = select_min(valid0,tp0.lower); clear(valid0,i); + found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog); + //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog); + valid0 &= tp0.lower+dt <= ray.tfar; + } + valid1 &= tp1.lower+dt <= ray.tfar; + + /* iterate over all second hits front to back */ + const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth)); + vboolx recursion_valid1 = valid1 & (depth < termDepth1); + valid1 &= depth >= termDepth1; + while (any(valid1)) + { + const size_t i = select_min(valid1,tp1.lower); clear(valid1,i); + found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog); + //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog); + valid1 &= tp1.lower+dt <= ray.tfar; + } + + /* push valid segments to stack */ + recursion_valid0 &= tp0.lower+dt <= ray.tfar; + recursion_valid1 &= tp1.lower+dt <= ray.tfar; + const vboolx recursion_valid = recursion_valid0 | recursion_valid1; + if (any(recursion_valid)) + { + assert(sptr < stack_size); + stack[sptr].valid = recursion_valid; + stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower); + stack[sptr].u0 = u0; + stack[sptr].u1 = u1; + stack[sptr].depth = depth+1; + sptr++; + } + } + return found; + } + + template<template<typename Ty> class NativeCurve> + struct SweepCurve1Intersector1 + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + template<typename Epilog> + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + + /* move ray closer to make intersection stable */ + NativeCurve3ff curve0(v0,v1,v2,v3); + curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); + const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); + const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); + const NativeCurve3ff curve1 = curve0-ref; + return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); + } + }; + + template<template<typename Ty> class NativeCurve, int K> + struct SweepCurve1IntersectorK + { + typedef NativeCurve<Vec3ff> NativeCurve3ff; + + struct Ray1 + { + __forceinline Ray1(RayK<K>& ray, size_t k) + : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} + + Vec3fa org; + Vec3fa dir; + float _tnear; + float& tfar; + + __forceinline float& tnear() { return _tnear; } + //__forceinline float& tfar() { return _tfar; } + __forceinline const float& tnear() const { return _tnear; } + //__forceinline const float& tfar() const { return _tfar; } + + }; + + template<typename Epilog> + __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + + /* move ray closer to make intersection stable */ + NativeCurve3ff curve0(v0,v1,v2,v3); + curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); + const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); + const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); + const NativeCurve3ff curve1 = curve0-ref; + return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h new file mode 100644 index 0000000000..cffa8e46ad --- /dev/null +++ b/thirdparty/embree/kernels/geometry/curve_intersector_virtual.h @@ -0,0 +1,671 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../subdiv/bezier_curve.h" +#include "../common/primref.h" +#include "curve_intersector_precalculations.h" +#include "../bvh/node_intersector1.h" +#include "../bvh/node_intersector_packet.h" + +#include "intersector_epilog.h" + +#include "../subdiv/bezier_curve.h" +#include "../subdiv/bspline_curve.h" +#include "../subdiv/hermite_curve.h" +#include "../subdiv/catmullrom_curve.h" + +#include "spherei_intersector.h" +#include "disci_intersector.h" + +#include "linei_intersector.h" +#include "roundlinei_intersector.h" +#include "conelinei_intersector.h" + +#include "curveNi_intersector.h" +#include "curveNv_intersector.h" +#include "curveNi_mb_intersector.h" + +#include "curve_intersector_distance.h" +#include "curve_intersector_ribbon.h" +#include "curve_intersector_oriented.h" +#include "curve_intersector_sweep.h" + +namespace embree +{ + struct VirtualCurveIntersector + { + typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive); + typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive); + + typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + public: + struct Intersectors + { + Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp. + + template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive); + template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive); + + template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + public: + Intersect1Ty intersect1; + Occluded1Ty occluded1; + Intersect4Ty intersect4; + Occluded4Ty occluded4; + Intersect8Ty intersect8; + Occluded8Ty occluded8; + Intersect16Ty intersect16; + Occluded16Ty occluded16; + }; + + Intersectors vtbl[Geometry::GTY_END]; + }; + + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); } + + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); } + +#if defined(__AVX__) + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); } +#endif + +#if defined(__AVX512F__) + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); } +#endif + + namespace isa + { + struct VirtualCurveIntersector1 + { + typedef unsigned char Primitive; + typedef CurvePrecalculations1 Precalculations; + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<1>(&pre,&ray,context,prim); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<1>(&pre,&ray,context,prim); + } + }; + + template<int K> + struct VirtualCurveIntersectorK + { + typedef unsigned char Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + size_t mask = movemask(valid_i); + while (mask) leafIntersector.intersect<K>(&pre,&ray,bscf(mask),context,prim); + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + vbool<K> valid_o = false; + size_t mask = movemask(valid_i); + while (mask) { + size_t k = bscf(mask); + if (leafIntersector.occluded<K>(&pre,&ray,k,context,prim)) + set(valid_o, k); + } + return valid_o; + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<K>(&pre,&ray,k,context,prim); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<K>(&pre,&ray,k,context,prim); + } + }; + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors SphereNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors DiscNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<int N> + static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1<N,true>::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiMBIntersector1<N,true>::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK<N,4,true>::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiMBIntersectorK<N,4,true>::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiMBIntersectorK<N,8,true>::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK<N,8,true>::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiMBIntersectorK<N,16,true>::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK<N,16,true>::occluded; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNvIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNvIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNvIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_n <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_n <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_n <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_n <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h <RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h <SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h <SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h <SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h <SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template<template<typename Ty> class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_hn <OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_hn <OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_hn <OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_hn <OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + } +} diff --git a/thirdparty/embree/kernels/geometry/cylinder.h b/thirdparty/embree/kernels/geometry/cylinder.h new file mode 100644 index 0000000000..dab02989ce --- /dev/null +++ b/thirdparty/embree/kernels/geometry/cylinder.h @@ -0,0 +1,223 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct Cylinder + { + const Vec3fa p0; //!< start location + const Vec3fa p1; //!< end position + const float rr; //!< squared radius of cylinder + + __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) + : p0(p0), p1(p1), rr(sqr(r)) {} + + __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) + : p0(p0), p1(p1), rr(rr) {} + + __forceinline bool intersect(const Vec3fa& org, + const Vec3fa& dir, + BBox1f& t_o, + float& u0_o, Vec3fa& Ng0_o, + float& u1_o, Vec3fa& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const float rl = rcp_length(p1-p0); + const Vec3fa P0 = p0, dP = (p1-p0)*rl; + const Vec3fa O = org-P0, dO = dir; + + const float dOdO = dot(dO,dO); + const float OdO = dot(dO,O); + const float OO = dot(O,O); + const float dOz = dot(dP,dO); + const float Oz = dot(dP,O); + + const float A = dOdO - sqr(dOz); + const float B = 2.0f * (OdO - dOz*Oz); + const float C = OO - sqr(Oz) - rr; + + /* we miss the cylinder if determinant is smaller than zero */ + const float D = B*B - 4.0f*A*C; + if (D < 0.0f) { + t_o = BBox1f(pos_inf,neg_inf); + return false; + } + + /* special case for rays that are parallel to the cylinder */ + const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + if (abs(A) < eps) + { + if (C <= 0.0f) { + t_o = BBox1f(neg_inf,pos_inf); + return true; + } else { + t_o = BBox1f(pos_inf,neg_inf); + return false; + } + } + + /* standard case for rays that are not parallel to the cylinder */ + const float Q = sqrt(D); + const float rcp_2A = rcp(2.0f*A); + const float t0 = (-B-Q)*rcp_2A; + const float t1 = (-B+Q)*rcp_2A; + + /* calculates u and Ng for near hit */ + { + u0_o = madd(t0,dOz,Oz)*rl; + const Vec3fa Pr = t0*dir; + const Vec3fa Pl = madd(u0_o,p1-p0,p0); + Ng0_o = Pr-Pl; + } + + /* calculates u and Ng for far hit */ + { + u1_o = madd(t1,dOz,Oz)*rl; + const Vec3fa Pr = t1*dir; + const Vec3fa Pl = madd(u1_o,p1-p0,p0); + Ng1_o = Pr-Pl; + } + + t_o.lower = t0; + t_o.upper = t1; + return true; + } + + __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const + { + float u0_o; Vec3fa Ng0_o; + float u1_o; Vec3fa Ng1_o; + return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + + static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1) + { + float eps = 0.001f; + BBox1f t; bool hit; + hit = cylinder.intersect(ray.org,ray.dir,t); + + bool failed = hit != shouldhit; + if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps; + if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : abs(t1-t.upper) > eps; + if (!failed) return true; + embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; + return false; + } + + /* verify cylinder class */ + static bool verify() + { + bool passed = true; + const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + return passed; + } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) { + return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}"; + } + }; + + template<int N> + struct CylinderN + { + const Vec3vf<N> p0; //!< start location + const Vec3vf<N> p1; //!< end position + const vfloat<N> rr; //!< squared radius of cylinder + + __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& r) + : p0(p0), p1(p1), rr(sqr(r)) {} + + __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& rr, bool) + : p0(p0), p1(p1), rr(rr) {} + + + __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, + BBox<vfloat<N>>& t_o, + vfloat<N>& u0_o, Vec3vf<N>& Ng0_o, + vfloat<N>& u1_o, Vec3vf<N>& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const vfloat<N> rl = rcp_length(p1-p0); + const Vec3vf<N> P0 = p0, dP = (p1-p0)*rl; + const Vec3vf<N> O = Vec3vf<N>(org)-P0, dO = dir; + + const vfloat<N> dOdO = dot(dO,dO); + const vfloat<N> OdO = dot(dO,O); + const vfloat<N> OO = dot(O,O); + const vfloat<N> dOz = dot(dP,dO); + const vfloat<N> Oz = dot(dP,O); + + const vfloat<N> A = dOdO - sqr(dOz); + const vfloat<N> B = 2.0f * (OdO - dOz*Oz); + const vfloat<N> C = OO - sqr(Oz) - rr; + + /* we miss the cylinder if determinant is smaller than zero */ + const vfloat<N> D = B*B - 4.0f*A*C; + vbool<N> valid = D >= 0.0f; + if (none(valid)) { + t_o = BBox<vfloat<N>>(empty); + return valid; + } + + /* standard case for rays that are not parallel to the cylinder */ + const vfloat<N> Q = sqrt(D); + const vfloat<N> rcp_2A = rcp(2.0f*A); + const vfloat<N> t0 = (-B-Q)*rcp_2A; + const vfloat<N> t1 = (-B+Q)*rcp_2A; + + /* calculates u and Ng for near hit */ + { + u0_o = madd(t0,dOz,Oz)*rl; + const Vec3vf<N> Pr = t0*Vec3vf<N>(dir); + const Vec3vf<N> Pl = madd(u0_o,p1-p0,p0); + Ng0_o = Pr-Pl; + } + + /* calculates u and Ng for far hit */ + { + u1_o = madd(t1,dOz,Oz)*rl; + const Vec3vf<N> Pr = t1*Vec3vf<N>(dir); + const Vec3vf<N> Pl = madd(u1_o,p1-p0,p0); + Ng1_o = Pr-Pl; + } + + t_o.lower = select(valid, t0, vfloat<N>(pos_inf)); + t_o.upper = select(valid, t1, vfloat<N>(neg_inf)); + + /* special case for rays that are parallel to the cylinder */ + const vfloat<N> eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + vbool<N> validt = valid & (abs(A) < eps); + if (unlikely(any(validt))) + { + vbool<N> inside = C <= 0.0f; + t_o.lower = select(validt,select(inside,vfloat<N>(neg_inf),vfloat<N>(pos_inf)),t_o.lower); + t_o.upper = select(validt,select(inside,vfloat<N>(pos_inf),vfloat<N>(neg_inf)),t_o.upper); + valid &= !validt | inside; + } + return valid; + } + + __forceinline vbool<N> intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const + { + vfloat<N> u0_o; Vec3vf<N> Ng0_o; + vfloat<N> u1_o; Vec3vf<N> Ng1_o; + return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + }; + } +} + diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h new file mode 100644 index 0000000000..816c066899 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/disc_intersector.h @@ -0,0 +1,216 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_points.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct DiscIntersectorHitM + { + __forceinline DiscIntersectorHitM() {} + + __forceinline DiscIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) + { + } + + __forceinline void finalize() {} + + __forceinline Vec2f uv(const size_t i) const + { + return Vec2f(vu[i], vv[i]); + } + __forceinline float t(const size_t i) const + { + return vt[i]; + } + __forceinline Vec3fa Ng(const size_t i) const + { + return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); + } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct DiscIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Epilog> + static __forceinline bool intersect( + const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + + valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections + if (unlikely(none(valid))) + return false; + + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); + return epilog(valid, hit); + } + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Vec3vf<M>& normal, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + vfloat<M> divisor = dot(Vec3vf<M>((Vec3fa)ray.dir), normal); + const vbool<M> parallel = divisor == vfloat<M>(0.f); + valid &= !parallel; + divisor = select(parallel, 1.f, divisor); // prevent divide by zero + + vfloat<M> t = dot(center - Vec3vf<M>((Vec3fa)ray.org), Vec3vf<M>(normal)) / divisor; + + valid &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar)); + if (unlikely(none(valid))) + return false; + + Vec3vf<M> intersection = Vec3vf<M>((Vec3fa)ray.org) + Vec3vf<M>((Vec3fa)ray.dir) * t; + vfloat<M> dist2 = dot(intersection - center, intersection - center); + valid &= dist2 < radius * radius; + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, t, normal); + return epilog(valid, hit); + } + }; + + template<int M, int K> + struct DiscIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, + size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + + valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k])); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections + if (unlikely(none(valid))) + return false; + + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); + return epilog(valid, hit); + } + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, + size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Vec3vf<M>& normal, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + vfloat<M> divisor = dot(Vec3vf<M>(ray_dir), normal); + const vbool<M> parallel = divisor == vfloat<M>(0.f); + valid &= !parallel; + divisor = select(parallel, 1.f, divisor); // prevent divide by zero + + vfloat<M> t = dot(center - Vec3vf<M>(ray_org), Vec3vf<M>(normal)) / divisor; + + valid &= (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k])); + if (unlikely(none(valid))) + return false; + + Vec3vf<M> intersection = Vec3vf<M>(ray_org) + Vec3vf<M>(ray_dir) * t; + vfloat<M> dist2 = dot(intersection - center, intersection - center); + valid &= dist2 < radius * radius; + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM<M> hit(zero, zero, t, normal); + return epilog(valid, hit); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/disci_intersector.h b/thirdparty/embree/kernels/geometry/disci_intersector.h new file mode 100644 index 0000000000..bb9d396f6e --- /dev/null +++ b/thirdparty/embree/kernels/geometry/disci_intersector.h @@ -0,0 +1,277 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "disc_intersector.h" +#include "intersector_epilog.h" +#include "pointi.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct DiscMiIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, bool filter> + struct DiscMiMBIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct DiscMiIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct DiscMiMBIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Disc.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, bool filter> + struct OrientedDiscMiIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, bool filter> + struct OrientedDiscMiMBIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()); + const vbool<M> valid = Disc.valid(); + return DiscIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM<M, filter>(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct OrientedDiscMiIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template<int M, int K, bool filter> + struct OrientedDiscMiMBIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(Disc.geomID()); + Vec4vf<M> v0; Vec3vf<M> n0; + Disc.gather(v0, n0, geom, ray.time()[k]); + const vbool<M> valid = Disc.valid(); + return DiscIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h new file mode 100644 index 0000000000..3b4d924ea7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/filter.h @@ -0,0 +1,204 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" +#include "../common/ray.h" +#include "../common/hit.h" +#include "../common/context.h" + +namespace embree +{ + namespace isa + { + __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + if (geometry->intersectionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(args); + + if (args->valid[0] == 0) + return false; + } + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + + if (args->valid[0] == 0) + return false; + } + + copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit); + return true; + } + + __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit) + { + RTCFilterFunctionNArguments args; + int mask = -1; + args.valid = &mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = 1; + return runIntersectionFilter1Helper(&args,geometry,context); + } + + __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) + { +#if defined(EMBREE_FILTER_FUNCTION) + IntersectContext* MAYBE_UNUSED context = args->internal_context; + const Geometry* const geometry = args->geometry; + if (geometry->intersectionFilterN) { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(filter_args); + } + + //if (args->valid[0] == 0) + // return; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(filter_args); + } +#endif + } + + __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + if (geometry->occlusionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(args); + + if (args->valid[0] == 0) + return false; + } + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + + if (args->valid[0] == 0) + return false; + } + return true; + } + + __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit) + { + RTCFilterFunctionNArguments args; + int mask = -1; + args.valid = &mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = 1; + return runOcclusionFilter1Helper(&args,geometry,context); + } + + __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) + { +#if defined(EMBREE_FILTER_FUNCTION) + IntersectContext* MAYBE_UNUSED context = args->internal_context; + const Geometry* const geometry = args->geometry; + if (geometry->occlusionFilterN) { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(filter_args); + } + + //if (args->valid[0] == 0) + // return false; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(filter_args); + } +#endif + } + + template<int K> + __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + vint<K>* mask = (vint<K>*) args->valid; + if (geometry->intersectionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(args); + } + + vbool<K> valid_o = *mask != vint<K>(zero); + if (none(valid_o)) return valid_o; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + } + + valid_o = *mask != vint<K>(zero); + if (none(valid_o)) return valid_o; + + copyHitToRay(valid_o,*(RayHitK<K>*)args->ray,*(HitK<K>*)args->hit); + return valid_o; + } + + template<int K> + __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit) + { + RTCFilterFunctionNArguments args; + vint<K> mask = valid.mask32(); + args.valid = (int*)&mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = K; + return runIntersectionFilterHelper<K>(&args,geometry,context); + } + + template<int K> + __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + vint<K>* mask = (vint<K>*) args->valid; + if (geometry->occlusionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(args); + } + + vbool<K> valid_o = *mask != vint<K>(zero); + + if (none(valid_o)) return valid_o; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + } + + valid_o = *mask != vint<K>(zero); + + RayK<K>* ray = (RayK<K>*) args->ray; + ray->tfar = select(valid_o, vfloat<K>(neg_inf), ray->tfar); + return valid_o; + } + + template<int K> + __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit) + { + RTCFilterFunctionNArguments args; + vint<K> mask = valid.mask32(); + args.valid = (int*)&mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = K; + return runOcclusionFilterHelper<K>(&args,geometry,context); + } + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_intersector.h b/thirdparty/embree/kernels/geometry/grid_intersector.h new file mode 100644 index 0000000000..9c59cef119 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_intersector.h @@ -0,0 +1,99 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "grid_soa_intersector1.h" +#include "grid_soa_intersector_packet.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template<typename T> + class SubdivPatch1Precalculations : public T + { + public: + __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) + : T(ray,ptr) {} + }; + + template<int K, typename T> + class SubdivPatch1PrecalculationsK : public T + { + public: + __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) + : T(valid,ray) {} + }; + + class Grid1Intersector1 + { + public: + typedef GridSOA Primitive; + typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); + } + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + intersect(pre,ray,context,prim,ty,lazy_node); + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); + } + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + return occluded(pre,ray,context,prim,ty,lazy_node); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) { + assert(false && "not implemented"); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + assert(false && "not implemented"); + return false; + } + }; + + template <int K> + struct GridIntersectorK + { + typedef GridSOA Primitive; + typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations; + + + static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node); + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node); + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node); + } + }; + + typedef Grid1IntersectorK<4> SubdivPatch1Intersector4; + typedef Grid1IntersectorK<8> SubdivPatch1Intersector8; + typedef Grid1IntersectorK<16> SubdivPatch1Intersector16; + + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_soa.h b/thirdparty/embree/kernels/geometry/grid_soa.h new file mode 100644 index 0000000000..cea90aedf6 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_soa.h @@ -0,0 +1,275 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_subdiv_mesh.h" +#include "../bvh/bvh.h" +#include "../subdiv/tessellation.h" +#include "../subdiv/tessellation_cache.h" +#include "subdivpatch1.h" + +namespace embree +{ + namespace isa + { + class GridSOA + { + public: + + /*! GridSOA constructor */ + GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr); + + /*! Subgrid creation */ + template<typename Allocator> + static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps, + unsigned x0, unsigned x1, unsigned y0, unsigned y1, + const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr) + { + const unsigned width = x1-x0+1; + const unsigned height = y1-y0+1; + const GridRange range(0,width-1,0,height-1); + size_t bvhBytes = 0; + if (time_steps == 1) + bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0); + else { + bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0); + bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D)); + } + const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float); + size_t rootBytes = time_steps*sizeof(BVH4::NodeRef); +#if !defined(__64BIT__) + rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding. +#endif + void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes); + assert(data); + return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get<SubdivMesh>(patches->geomID()),bvhBytes,gridBytes,bounds_o); + } + + /*! Grid creation */ + template<typename Allocator> + static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps, + const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) + { + return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o); + } + + /*! returns reference to root */ + __forceinline BVH4::NodeRef& root(size_t t = 0) { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } + __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } + + /*! returns pointer to BVH array */ + __forceinline char* bvhData() { return &data[0]; } + __forceinline const char* bvhData() const { return &data[0]; } + + /*! returns pointer to Grid array */ + __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; } + __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; } + + __forceinline void* encodeLeaf(size_t u, size_t v) { + return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf + } + __forceinline float* decodeLeaf(size_t t, const void* ptr) { + return gridData(t) + (((size_t) (ptr) >> 4) - 1); + } + + /*! returns the size of the BVH over the grid in bytes */ + static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes); + + /*! returns the size of the temporal BVH over the time range BVHs */ + static size_t getTemporalBVHBytes(const range<int> time_range, const size_t nodeBytes); + + /*! calculates bounding box of grid range */ + __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const + { + const float* const grid_array = gridData(time); + const float* const grid_x_array = grid_array + 0 * dim_offset; + const float* const grid_y_array = grid_array + 1 * dim_offset; + const float* const grid_z_array = grid_array + 2 * dim_offset; + + /* compute the bounds just for the range! */ + BBox3fa bounds( empty ); + for (unsigned v = range.v_start; v<=range.v_end; v++) + { + for (unsigned u = range.u_start; u<=range.u_end; u++) + { + const float x = grid_x_array[ v * width + u]; + const float y = grid_y_array[ v * width + u]; + const float z = grid_z_array[ v * width + u]; + bounds.extend( Vec3fa(x,y,z) ); + } + } + assert(is_finite(bounds)); + return bounds; + } + + /*! Evaluates grid over patch and builds BVH4 tree over the grid. */ + std::pair<BVH4::NodeRef,BBox3fa> buildBVH(BBox3fa* bounds_o); + + /*! Create BVH4 tree over grid. */ + std::pair<BVH4::NodeRef,BBox3fa> buildBVH(const GridRange& range, size_t& allocator); + + /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */ + std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, BBox3fa* bounds_o); + + /*! Create MBlur BVH4 tree over grid. */ + std::pair<BVH4::NodeRef,LBBox3fa> buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator); + + /*! Create MSMBlur BVH4 tree over grid. */ + std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, size_t& allocator, BBox3fa* bounds_o); + + template<typename Loader> + struct MapUV + { + typedef typename Loader::vfloat vfloat; + const float* const grid_uv; + size_t line_offset; + size_t lines; + + __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines) + : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {} + + __forceinline void operator() (vfloat& u, vfloat& v, Vec3<vfloat>& Ng) const { + const Vec3<vfloat> tri_v012_uv = Loader::gather(grid_uv,line_offset,lines); + const Vec2<vfloat> uv0 = GridSOA::decodeUV(tri_v012_uv[0]); + const Vec2<vfloat> uv1 = GridSOA::decodeUV(tri_v012_uv[1]); + const Vec2<vfloat> uv2 = GridSOA::decodeUV(tri_v012_uv[2]); + const Vec2<vfloat> uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0; + u = uv[0];v = uv[1]; + } + }; + + struct Gather2x3 + { + enum { M = 4 }; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines) + { + vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset); + vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid + if (unlikely(line_offset == 2)) + { + r0 = shuffle<0,1,1,1>(r0); + r1 = shuffle<0,1,1,1>(r1); + } + return Vec3vf4(unpacklo(r0,r1), // r00, r10, r01, r11 + shuffle<1,1,2,2>(r0), // r01, r01, r02, r02 + shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12 + } + + static __forceinline void gather(const float* const grid_x, + const float* const grid_y, + const float* const grid_z, + const size_t line_offset, + const size_t lines, + Vec3vf4& v0_o, + Vec3vf4& v1_o, + Vec3vf4& v2_o) + { + const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines); + const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines); + const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines); + v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); + v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); + v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); + } + }; + +#if defined (__AVX__) + struct Gather3x3 + { + enum { M = 8 }; + typedef vbool8 vbool; + typedef vint8 vint; + typedef vfloat8 vfloat; + + static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines) + { + vfloat4 ra = vfloat4::loadu(grid + 0*line_offset); + vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid + vfloat4 rc; + if (likely(lines > 2)) + rc = vfloat4::loadu(grid + 2*line_offset); + else + rc = rb; + + if (unlikely(line_offset == 2)) + { + ra = shuffle<0,1,1,1>(ra); + rb = shuffle<0,1,1,1>(rb); + rc = shuffle<0,1,1,1>(rc); + } + + const vfloat8 r0 = vfloat8(ra,rb); + const vfloat8 r1 = vfloat8(rb,rc); + return Vec3vf8(unpacklo(r0,r1), // r00, r10, r01, r11, r10, r20, r11, r21 + shuffle<1,1,2,2>(r0), // r01, r01, r02, r02, r11, r11, r12, r12 + shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12, r20, r21, r21, r22 + } + + static __forceinline void gather(const float* const grid_x, + const float* const grid_y, + const float* const grid_z, + const size_t line_offset, + const size_t lines, + Vec3vf8& v0_o, + Vec3vf8& v1_o, + Vec3vf8& v2_o) + { + const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines); + const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines); + const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines); + v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); + v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); + v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); + } + }; +#endif + + template<typename vfloat> + static __forceinline Vec2<vfloat> decodeUV(const vfloat& uv) + { + typedef typename vfloat::Int vint; + const vint iu = asInt(uv) & 0xffff; + const vint iv = srl(asInt(uv),16); + const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000); + const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000); + return Vec2<vfloat>(u,v); + } + + __forceinline unsigned int geomID() const { + return _geomID; + } + + __forceinline unsigned int primID() const { + return _primID; + } + + public: + BVH4::NodeRef troot; +#if !defined(__64BIT__) + unsigned align1; +#endif + unsigned time_steps; + unsigned width; + + unsigned height; + unsigned dim_offset; + unsigned _geomID; + unsigned _primID; + + unsigned align2; + unsigned gridOffset; + unsigned gridBytes; + unsigned rootOffset; + + char data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h new file mode 100644 index 0000000000..8fbf0d4bdf --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector1.h @@ -0,0 +1,207 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "../common/ray.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + class GridSOAIntersector1 + { + public: + typedef void Primitive; + + class Precalculations + { + public: + __forceinline Precalculations (const Ray& ray, const void* ptr) + : grid(nullptr) {} + + public: + GridSOA* grid; + int itime; + float ftime; + }; + + template<typename Loader> + static __forceinline void intersect(RayHit& ray, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3<vfloat> v0, v1, v2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(Ray& ray, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> v0, v1, v2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, context, grid_x , line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + + class GridSOAMBIntersector1 + { + public: + typedef void Primitive; + typedef GridSOAIntersector1::Precalculations Precalculations; + + template<typename Loader> + static __forceinline void intersect(RayHit& ray, const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(Ray& ray, const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1<Loader::M> intersector(ray,nullptr); + return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); + +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x, line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h new file mode 100644 index 0000000000..14cacab5fe --- /dev/null +++ b/thirdparty/embree/kernels/geometry/grid_soa_intersector_packet.h @@ -0,0 +1,445 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "../common/ray.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + template<int K> + struct MapUV0 + { + const float* const grid_uv; + size_t ofs00, ofs01, ofs10, ofs11; + + __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) + : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} + + __forceinline void operator() (vfloat<K>& u, vfloat<K>& v, Vec3vf<K>& Ng) const { + const vfloat<K> uv00(grid_uv[ofs00]); + const vfloat<K> uv01(grid_uv[ofs01]); + const vfloat<K> uv10(grid_uv[ofs10]); + const vfloat<K> uv11(grid_uv[ofs11]); + const Vec2vf<K> uv0 = GridSOA::decodeUV(uv00); + const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01); + const Vec2vf<K> uv2 = GridSOA::decodeUV(uv10); + const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); + u = uv[0]; v = uv[1]; + } + }; + + template<int K> + struct MapUV1 + { + const float* const grid_uv; + size_t ofs00, ofs01, ofs10, ofs11; + + __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) + : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} + + __forceinline void operator() (vfloat<K>& u, vfloat<K>& v, Vec3vf<K>& Ng) const { + const vfloat<K> uv00(grid_uv[ofs00]); + const vfloat<K> uv01(grid_uv[ofs01]); + const vfloat<K> uv10(grid_uv[ofs10]); + const vfloat<K> uv11(grid_uv[ofs11]); + const Vec2vf<K> uv0 = GridSOA::decodeUV(uv10); + const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01); + const Vec2vf<K> uv2 = GridSOA::decodeUV(uv11); + const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); + u = uv[0]; v = uv[1]; + } + }; + + template<int K> + class GridSOAIntersectorK + { + public: + typedef void Primitive; + + class Precalculations + { +#if defined(__AVX__) + static const int M = 8; +#else + static const int M = 4; +#endif + + public: + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) + : grid(nullptr), intersector(valid,ray) {} + + public: + GridSOA* grid; + PlueckerIntersectorK<M,K> intersector; // FIXME: use quad intersector + }; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + const size_t ofs00 = (y+0)*line_offset+(x+0); + const size_t ofs01 = (y+0)*line_offset+(x+1); + const size_t ofs10 = (y+1)*line_offset+(x+0); + const size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + + pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + } + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + vbool<K> valid = valid_i; + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + const size_t ofs00 = (y+0)*line_offset+(x+0); + const size_t ofs01 = (y+0)*line_offset+(x+1); + const size_t ofs10 = (y+1)*line_offset+(x+0); + const size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + + pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + } + } + return !valid; + } + + template<typename Loader> + static __forceinline void intersect(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(RayK<K>& ray, size_t k, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, k, context, grid_x , line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + + template<int K> + class GridSOAMBIntersectorK + { + public: + typedef void Primitive; + typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + vfloat<K> vftime; + vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime); + + vbool<K> valid1 = valid_i; + while (any(valid1)) { + const size_t j = bsf(movemask(valid1)); + const int itime = vitime[j]; + const vbool<K> valid2 = valid1 & (itime == vitime); + valid1 = valid1 & !valid2; + intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node); + } + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + size_t ofs00 = (y+0)*line_offset+(x+0); + size_t ofs01 = (y+0)*line_offset+(x+1); + size_t ofs10 = (y+1)*line_offset+(x+0); + size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + ofs00 += grid_offset; + ofs01 += grid_offset; + ofs10 += grid_offset; + ofs11 += grid_offset; + const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + const Vec3vf<K> p00 = lerp(a00,b00,ftime); + const Vec3vf<K> p01 = lerp(a01,b01,ftime); + const Vec3vf<K> p10 = lerp(a10,b10,ftime); + const Vec3vf<K> p11 = lerp(a11,b11,ftime); + + pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + } + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + vfloat<K> vftime; + vint<K> vitime = getTimeSegment<K>(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime); + + vbool<K> valid_o = valid_i; + vbool<K> valid1 = valid_i; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int itime = vitime[j]; + const vbool<K> valid2 = valid1 & (itime == vitime); + valid1 = valid1 & !valid2; + valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node); + } + return !valid_o; + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + vbool<K> valid = valid_i; + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 1 : 2; + for (size_t y=0; y<max_y; y++) + { + for (size_t x=0; x<max_x; x++) + { + size_t ofs00 = (y+0)*line_offset+(x+0); + size_t ofs01 = (y+0)*line_offset+(x+1); + size_t ofs10 = (y+1)*line_offset+(x+0); + size_t ofs11 = (y+1)*line_offset+(x+1); + const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + ofs00 += grid_offset; + ofs01 += grid_offset; + ofs10 += grid_offset; + ofs11 += grid_offset; + const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + const Vec3vf<K> p00 = lerp(a00,b00,ftime); + const Vec3vf<K> p01 = lerp(a01,b01,ftime); + const Vec3vf<K> p10 = lerp(a10,b10,ftime); + const Vec3vf<K> p11 = lerp(a11,b11,ftime); + + pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + } + } + return valid; + } + + template<typename Loader> + static __forceinline void intersect(RayHitK<K>& ray, size_t k, + const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template<typename Loader> + static __forceinline bool occluded(RayK<K>& ray, size_t k, + const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3<vfloat> a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3<vfloat> b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime)); + Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime)); + Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime)); + + return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + float ftime; + int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); + + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + +#if defined(__AVX__) + intersect<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre); +#else + intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre); + if (likely(lines > 2)) + intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + float ftime; + int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); + + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + +#if defined(__AVX__) + return occluded<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre); +#else + if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/instance.h b/thirdparty/embree/kernels/geometry/instance.h new file mode 100644 index 0000000000..7c0e7e0f49 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/instance.h @@ -0,0 +1,78 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene_instance.h" + +namespace embree +{ + struct InstancePrimitive + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return N; } + + public: + + InstancePrimitive (const Instance* instance, unsigned int instID) + : instance(instance) + , instID_(instID) + {} + + __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) + { + assert(end-i == 1); + const PrimRef& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get<Instance>(geomID); + new (this) InstancePrimitive(instance, geomID); + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) + { + assert(end-i == 1); + const PrimRef& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get<Instance>(geomID); + new (this) InstancePrimitive(instance,geomID); + return instance->linearBounds(0,itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) + { + assert(end-i == 1); + const PrimRefMB& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get<Instance>(geomID); + new (this) InstancePrimitive(instance,geomID); + return instance->linearBounds(0,time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(Instance* instance) { + return instance->bounds(0); + } + + public: + const Instance* instance; + const unsigned int instID_ = std::numeric_limits<unsigned int>::max (); + }; +} diff --git a/thirdparty/embree/kernels/geometry/instance_intersector.h b/thirdparty/embree/kernels/geometry/instance_intersector.h new file mode 100644 index 0000000000..28a7b728e5 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/instance_intersector.h @@ -0,0 +1,84 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "instance.h" +#include "../common/ray.h" +#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + struct InstanceIntersector1 + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); + static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); + static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); + }; + + struct InstanceIntersector1MB + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); + static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); + static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); + }; + + template<int K> + struct InstanceIntersectorK + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} + }; + + static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim); + static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim); + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); + return ray.tfar[k] < 0.0f; + } + }; + + template<int K> + struct InstanceIntersectorKMB + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} + }; + + static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim); + static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim); + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); + return ray.tfar[k] < 0.0f; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/intersector_epilog.h b/thirdparty/embree/kernels/geometry/intersector_epilog.h new file mode 100644 index 0000000000..7bf134cc54 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/intersector_epilog.h @@ -0,0 +1,979 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/context.h" +#include "filter.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct UVIdentity { + __forceinline void operator() (vfloat<M>& u, vfloat<M>& v, Vec3vf<M>& Ng) const {} + }; + + + template<bool filter> + struct Intersect1Epilog1 + { + RayHit& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1Epilog1(RayHit& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar; + ray.tfar = hit.t; + bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + return found; + } + } +#endif + + /* update hit information */ + ray.tfar = hit.t; + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = primID; + ray.geomID = geomID; + instance_id_stack::copy_UU(context->user->instID, ray.instID); + return true; + } + }; + + template<bool filter> + struct Occluded1Epilog1 + { + Ray& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1Epilog1(Ray& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { + HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar; + ray.tfar = hit.t; + const bool found = runOcclusionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + return found; + } + } +#endif + return true; + } + }; + + template<int K, bool filter> + struct Intersect1KEpilog1 + { + RayHitK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t; + const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + return found; + } + } +#endif + + /* update hit information */ + ray.tfar[k] = hit.t; + ray.Ng.x[k] = hit.Ng.x; + ray.Ng.y[k] = hit.Ng.y; + ray.Ng.z[k] = hit.Ng.z; + ray.u[k] = hit.u; + ray.v[k] = hit.v; + ray.primID[k] = primID; + ray.geomID[k] = geomID; + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k); + return true; + } + }; + + template<int K, bool filter> + struct Occluded1KEpilog1 + { + RayK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { + hit.finalize(); + HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t; + const bool found = any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + return found; + } + } +#endif + return true; + } + }; + + template<int M, bool filter> + struct Intersect1EpilogM + { + RayHit& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Intersect1EpilogM(RayHit& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + vbool<M> valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + unsigned int geomID = geomIDs[i]; + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + bool foundhit = false; + goto entry; + while (true) + { + if (unlikely(none(valid))) return foundhit; + i = select_min(valid,hit.vt); + + geomID = geomIDs[i]; + entry: + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + continue; + } + } +#endif + break; + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + ray.tfar = hit.vt[i]; + ray.Ng.x = hit.vNg.x[i]; + ray.Ng.y = hit.vNg.y[i]; + ray.Ng.z = hit.vNg.z[i]; + ray.u = uv.x; + ray.v = uv.y; + ray.primID = primIDs[i]; + ray.geomID = geomID; + instance_id_stack::copy_UU(context->user->instID, ray.instID); + return true; + + } + }; + + template<int M, bool filter> + struct Occluded1EpilogM + { + Ray& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Occluded1EpilogM(Ray& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + if (unlikely(filter)) + hit.finalize(); /* called only once */ + + vbool<M> valid = valid_i; + size_t m=movemask(valid); + goto entry; + while (true) + { + if (unlikely(m == 0)) return false; + entry: + size_t i=bsf(m); + + const unsigned int geomID = geomIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + m=btc(m,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* if we have no filter then the test passed */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + if (runOcclusionFilter1(geometry,ray,context,h)) return true; + ray.tfar = old_t; + m=btc(m,i); + continue; + } + } +#endif + break; + } +#endif + + return true; + } + }; + + template<int M, bool filter> + struct Intersect1EpilogMU + { + RayHit& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1EpilogMU(RayHit& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + + vbool<M> valid = valid_i; + hit.finalize(); + + size_t i = select_min(valid,hit.vt); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) + { + bool foundhit = false; + while (true) + { + /* call intersection filter function */ + Vec2f uv = hit.uv(i); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + if (unlikely(none(valid))) break; + i = select_min(valid,hit.vt); + } + return foundhit; + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + const Vec3fa Ng = hit.Ng(i); + ray.tfar = hit.t(i); + ray.Ng.x = Ng.x; + ray.Ng.y = Ng.y; + ray.Ng.z = Ng.z; + ray.u = uv.x; + ray.v = uv.y; + ray.primID = primID; + ray.geomID = geomID; + instance_id_stack::copy_UU(context->user->instID, ray.instID); + return true; + } + }; + + template<int M, bool filter> + struct Occluded1EpilogMU + { + Ray& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1EpilogMU(Ray& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid, Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + hit.finalize(); + for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + if (runOcclusionFilter1(geometry,ray,context,h)) return true; + ray.tfar = old_t; + } + return false; + } +#endif + return true; + } + }; + + template<int M, int K, bool filter> + struct IntersectKEpilogM + { + RayHitK<K>& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + const size_t i; + + __forceinline IntersectKEpilogM(RayHitK<K>& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs, + size_t i) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + + vfloat<K> u, v, t; + Vec3vf<K> Ng; + vbool<K> valid = valid_i; + + std::tie(u,v,t,Ng) = hit(); + + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + /* ray masking test */ +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* occlusion filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h); + ray.tfar = select(m_accept,ray.tfar,old_t); + return m_accept; + } + } +#endif + + /* update hit information */ + vfloat<K>::store(valid,&ray.tfar,t); + vfloat<K>::store(valid,&ray.Ng.x,Ng.x); + vfloat<K>::store(valid,&ray.Ng.y,Ng.y); + vfloat<K>::store(valid,&ray.Ng.z,Ng.z); + vfloat<K>::store(valid,&ray.u,u); + vfloat<K>::store(valid,&ray.v,v); + vuint<K>::store(valid,&ray.primID,primID); + vuint<K>::store(valid,&ray.geomID,geomID); + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid); + return valid; + } + }; + + template<int M, int K, bool filter> + struct OccludedKEpilogM + { + vbool<K>& valid0; + RayK<K>& ray; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + const size_t i; + + __forceinline OccludedKEpilogM(vbool<K>& valid0, + RayK<K>& ray, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs, + size_t i) + : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const + { + vbool<K> valid = valid_i; + + /* ray masking test */ + Scene* scene MAYBE_UNUSED = context->scene; + const unsigned int geomID MAYBE_UNUSED = geomIDs[i]; + const unsigned int primID MAYBE_UNUSED = primIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return valid; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + vfloat<K> u, v, t; + Vec3vf<K> Ng; + std::tie(u,v,t,Ng) = hit(); + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + valid = runOcclusionFilter(valid,geometry,ray,context,h); + ray.tfar = select(valid,ray.tfar,old_t); + } + } +#endif + + /* update occlusion */ + valid0 = valid0 & !valid; + return valid; + } + }; + + template<int M, int K, bool filter> + struct IntersectKEpilogMU + { + RayHitK<K>& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline IntersectKEpilogMU(RayHitK<K>& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_org, const Hit& hit) const + { + vbool<K> valid = valid_org; + vfloat<K> u, v, t; + Vec3vf<K> Ng; + std::tie(u,v,t,Ng) = hit(); + + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + /* ray masking test */ +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h); + ray.tfar = select(m_accept,ray.tfar,old_t); + return m_accept; + } + } +#endif + + /* update hit information */ + vfloat<K>::store(valid,&ray.tfar,t); + vfloat<K>::store(valid,&ray.Ng.x,Ng.x); + vfloat<K>::store(valid,&ray.Ng.y,Ng.y); + vfloat<K>::store(valid,&ray.Ng.z,Ng.z); + vfloat<K>::store(valid,&ray.u,u); + vfloat<K>::store(valid,&ray.v,v); + vuint<K>::store(valid,&ray.primID,primID); + vuint<K>::store(valid,&ray.geomID,geomID); + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, valid); + return valid; + } + }; + + template<int M, int K, bool filter> + struct OccludedKEpilogMU + { + vbool<K>& valid0; + RayK<K>& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline OccludedKEpilogMU(vbool<K>& valid0, + RayK<K>& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const + { + vbool<K> valid = valid_i; + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* occlusion filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + vfloat<K> u, v, t; + Vec3vf<K> Ng; + std::tie(u,v,t,Ng) = hit(); + HitK<K> h(context->user,geomID,primID,u,v,Ng); + const vfloat<K> old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + valid = runOcclusionFilter(valid,geometry,ray,context,h); + ray.tfar = select(valid,ray.tfar,old_t); + } + } +#endif + + /* update occlusion */ + valid0 = valid0 & !valid; + return valid; + } + }; + + template<int M, int K, bool filter> + struct Intersect1KEpilogM + { + RayHitK<K>& ray; + size_t k; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + vbool<M> valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + assert(i<M); + unsigned int geomID = geomIDs[i]; + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + bool foundhit = false; + goto entry; + while (true) + { + if (unlikely(none(valid))) return foundhit; + i = select_min(valid,hit.vt); + assert(i<M); + geomID = geomIDs[i]; + entry: + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask[k]) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + assert(i<M); + const Vec2f uv = hit.uv(i); + HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + foundhit = foundhit | found; + clear(valid,i); + valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value + continue; + } + } +#endif + break; + } +#endif + assert(i<M); + /* update hit information */ + const Vec2f uv = hit.uv(i); + ray.tfar[k] = hit.t(i); + ray.Ng.x[k] = hit.vNg.x[i]; + ray.Ng.y[k] = hit.vNg.y[i]; + ray.Ng.z[k] = hit.vNg.z[i]; + ray.u[k] = uv.x; + ray.v[k] = uv.y; + ray.primID[k] = primIDs[i]; + ray.geomID[k] = geomID; + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k); + return true; + } + }; + + template<int M, int K, bool filter> + struct Occluded1KEpilogM + { + RayK<K>& ray; + size_t k; + IntersectContext* context; + const vuint<M>& geomIDs; + const vuint<M>& primIDs; + + __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k, + IntersectContext* context, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) + : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + if (unlikely(filter)) + hit.finalize(); /* called only once */ + + vbool<M> valid = valid_i; + size_t m=movemask(valid); + goto entry; + while (true) + { + if (unlikely(m == 0)) return false; + entry: + size_t i=bsf(m); + + const unsigned int geomID = geomIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask[k]) == 0) { + m=btc(m,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* execute occlusion filer */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true; + ray.tfar[k] = old_t; + m=btc(m,i); + continue; + } + } +#endif + break; + } +#endif + return true; + } + }; + + template<int M, int K, bool filter> + struct Intersect1KEpilogMU + { + RayHitK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + /* ray mask test */ + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* finalize hit calculation */ + vbool<M> valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) + { + bool foundhit = false; + while (true) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); + if (!found) ray.tfar[k] = old_t; + foundhit = foundhit | found; + clear(valid,i); + valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value + if (unlikely(none(valid))) break; + i = select_min(valid,hit.vt); + } + return foundhit; + } + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + const Vec3fa Ng = hit.Ng(i); + ray.tfar[k] = hit.t(i); + ray.Ng.x[k] = Ng.x; + ray.Ng.y[k] = Ng.y; + ray.Ng.z[k] = Ng.z; + ray.u[k] = uv.x; + ray.v[k] = uv.y; + ray.primID[k] = primID; + ray.geomID[k] = geomID; + instance_id_stack::copy_UV<K>(context->user->instID, ray.instID, k); + return true; + } + }; + + template<int M, int K, bool filter> + struct Occluded1KEpilogMU + { + RayK<K>& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template<typename Hit> + __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + /* ray mask test */ + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + hit.finalize(); + for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true; + ray.tfar[k] = old_t; + } + return false; + } + } +#endif + return true; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/intersector_iterators.h b/thirdparty/embree/kernels/geometry/intersector_iterators.h new file mode 100644 index 0000000000..9cac1cd25c --- /dev/null +++ b/thirdparty/embree/kernels/geometry/intersector_iterators.h @@ -0,0 +1,172 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/ray.h" +#include "../common/point_query.h" +#include "../bvh/node_intersector1.h" +#include "../bvh/node_intersector_packet.h" + +namespace embree +{ + namespace isa + { + template<typename Intersector> + struct ArrayIntersector1 + { + typedef typename Intersector::Primitive Primitive; + typedef typename Intersector::Precalculations Precalculations; + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) + Intersector::intersect(pre,ray,context,prim[i]); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + if (Intersector::occluded(pre,ray,context,prim[i])) + return true; + } + return false; + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0; i<num; i++) + changed |= Intersector::pointQuery(query, context, prim[i]); + return changed; + } + + template<int K> + static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + } + + template<int K> + static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + return valid; + } + }; + + template<int K, typename Intersector> + struct ArrayIntersectorK_1 + { + typedef typename Intersector::Primitive Primitive; + typedef typename Intersector::Precalculations Precalculations; + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + Intersector::intersect(valid,pre,ray,context,prim[i]); + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + vbool<K> valid0 = valid; + for (size_t i=0; i<num; i++) { + valid0 &= !Intersector::occluded(valid0,pre,ray,context,prim[i]); + if (none(valid0)) break; + } + return !valid0; + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + Intersector::intersect(pre,ray,k,context,prim[i]); + } + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + for (size_t i=0; i<num; i++) { + if (Intersector::occluded(pre,ray,k,context,prim[i])) + return true; + } + return false; + } + }; + + // ============================================================================================= + + template<int K, typename IntersectorK> + struct ArrayIntersectorKStream + { + typedef typename IntersectorK::Primitive PrimitiveK; + typedef typename IntersectorK::Precalculations PrecalculationsK; + + static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(valid,ray); // FIXME: might cause trouble + + for (size_t i=0; i<num; i++) { + IntersectorK::intersect(valid,pre,ray,context,prim[i]); + } + } + + static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(valid,ray); // FIXME: might cause trouble + vbool<K> valid0 = valid; + for (size_t i=0; i<num; i++) { + valid0 &= !IntersectorK::occluded(valid0,pre,ray,context,prim[i]); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + for (size_t i=0; i<num; i++) { + IntersectorK::intersect(pre,ray,k,context,prim[i]); + } + } + + static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + for (size_t i=0; i<num; i++) { + if (IntersectorK::occluded(pre,ray,k,context,prim[i])) + return true; + } + return false; + } + + static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + size_t m_occluded = 0; + for (size_t i=0; i<num; i++) { + size_t bits = cur_mask & (~m_occluded); + for (; bits!=0; ) + { + const size_t rayID = bscf(bits); + RayHitK<K> &ray = *inputPackets[rayID / K]; + const size_t k = rayID % K; + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + if (IntersectorK::occluded(pre,ray,k,context,prim[i])) + { + m_occluded |= (size_t)1 << rayID; + ray.tfar[k] = neg_inf; + } + } + } + return m_occluded; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/line_intersector.h b/thirdparty/embree/kernels/geometry/line_intersector.h new file mode 100644 index 0000000000..41096d8794 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/line_intersector.h @@ -0,0 +1,145 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct LineIntersectorHitM + { + __forceinline LineIntersectorHitM() {} + + __forceinline LineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct FlatLinearCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Ray, typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Epilog& epilog) + { + /* transform end points into ray space */ + vbool<M> valid = valid_i; + vfloat<M> depth_scale = pre.depth_scale; + LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space; + + const Vec3vf<M> ray_org ((Vec3fa)ray.org); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + + Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); + Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); + + /* approximative intersection with cone */ + const Vec4vf<M> v = p1-p0; + const Vec4vf<M> w = -p0; + const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); + const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); + const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); + const Vec4vf<M> p = madd(u,v,p0); + const vfloat<M> t = p.z; + const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); + const vfloat<M> r = p.w; + const vfloat<M> r2 = r*r; + valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections + if (unlikely(none(valid))) return false; + + /* ignore denormalized segments */ + const Vec3vf<M> T = v1.xyz()-v0.xyz(); + valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero)); + if (unlikely(none(valid))) return false; + + /* update hit information */ + LineIntersectorHitM<M> hit(u,zero,t,T); + return epilog(valid,hit); + } + }; + + template<int M, int K> + struct FlatLinearCurveIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Epilog& epilog) + { + /* transform end points into ray space */ + vbool<M> valid = valid_i; + vfloat<M> depth_scale = pre.depth_scale[k]; + LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space[k]; + const Vec3vf<M> ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + + Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); + Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); + + /* approximative intersection with cone */ + const Vec4vf<M> v = p1-p0; + const Vec4vf<M> w = -p0; + const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); + const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); + const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); + const Vec4vf<M> p = madd(u,v,p0); + const vfloat<M> t = p.z; + const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); + const vfloat<M> r = p.w; + const vfloat<M> r2 = r*r; + valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k])); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections + if (unlikely(none(valid))) return false; + + /* ignore denormalized segments */ + const Vec3vf<M> T = v1.xyz()-v0.xyz(); + valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero)); + if (unlikely(none(valid))) return false; + + /* update hit information */ + LineIntersectorHitM<M> hit(u,zero,t,T); + return epilog(valid,hit); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/linei.h b/thirdparty/embree/kernels/geometry/linei.h new file mode 100644 index 0000000000..3ee70ac012 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/linei.h @@ -0,0 +1,705 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + template<int M> + struct LineMi + { + /* Virtual interface to query information about the line segment type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored line segments */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N line segments */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + /* Returns required number of bytes for N line segments */ + static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); } + + public: + + /* Default constructor */ + __forceinline LineMi() { } + + /* Construction from vertices and IDs */ + __forceinline LineMi(const vuint<M>& v0, unsigned short leftExists, unsigned short rightExists, const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype) + : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint<M>(primIDs) != vuint<M>(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs) + { + assert(all(vuint<M>(geomID()) == geomIDs)); + } + + /* Returns a mask that tells which line segments are valid */ + __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); } + + /* Returns if the specified line segment is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; } + + /* Returns the number of stored line segments */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + //template<class T> + //static __forceinline T unmask(T &index) { return index & 0x3fffffff; } + + __forceinline unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; } + //__forceinline vuint<M> geomID() { return unmask(geomIDs); } + //__forceinline const vuint<M> geomID() const { return unmask(geomIDs); } + //__forceinline unsigned int geomID(const size_t i) const { assert(i<M); return unmask(geomIDs[i]); } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* gather the line segments */ + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, + Vec4vf<M>& p1, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + const LineSegments* geom, + float time) const; + + /* gather the line segments with lateral info */ + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + Vec4vf<M>& pL, + Vec4vf<M>& pR, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, + Vec4vf<M>& p1, + Vec4vf<M>& pL, + Vec4vf<M>& pR, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + Vec4vf<M>& pL, + Vec4vf<M>& pR, + const LineSegments* geom, + float time) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + vbool<M>& cL, + vbool<M>& cR, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, + Vec4vf<M>& p1, + vbool<M>& cL, + vbool<M>& cR, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, + Vec4vf<M>& p1, + vbool<M>& cL, + vbool<M>& cR, + const LineSegments* geom, + float time) const; + + /* Calculate the bounds of the line segments */ + __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const LineSegments* geom = scene->get<LineSegments>(geomID(i)); + const Vec3ff& p0 = geom->vertex(v0[i]+0,itime); + const Vec3ff& p1 = geom->vertex(v0[i]+1,itime); + BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); + b = enlarge(b,Vec3fa(max(p0.w,p1.w))); + bounds.extend(b); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { + return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const LineSegments* geom = scene->get<LineSegments>(geomID(i)); + allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const LineSegments* geom = scene->get<LineSegments>(geomID((unsigned int)i)); + allBounds.extend(geom->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill line segment from line segment list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); + vuint<M> geomID, primID; + vuint<M> v0; + unsigned short leftExists = 0; + unsigned short rightExists = 0; + const PrimRefT* prim = &prims[begin]; + + for (size_t i=0; i<M; i++) + { + const LineSegments* geom = scene->get<LineSegments>(prim->geomID()); + if (begin<end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); + v0[i] = geom->segment(prim->primID()); + leftExists |= geom->segmentLeftExists(primID[i]) << i; + rightExists |= geom->segmentRightExists(primID[i]) << i; + begin++; + } else { + assert(i); + if (i>0) { + geomID[i] = geomID[i-1]; + primID[i] = -1; + v0[i] = v0[i-1]; + } + } + if (begin<end) prim = &prims[begin]; // FIXME: remove this line + } + new (this) LineMi(v0,leftExists,rightExists,geomID,primID,gty); // FIXME: use non temporal store + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = LineMi::blocks(set.size()); + size_t numbytes = LineMi::bytes(set.size()); + LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims,begin,end,scene); + return linearBounds(scene,itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims,begin,end,scene); + return linearBounds(scene,time_range); + } + + template<typename BVH, typename SetMB, typename Allocator> + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.begin(); + size_t end = prims.end(); + size_t items = LineMi::blocks(prims.size()); + size_t numbytes = LineMi::bytes(prims.size()); + LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range)); + + return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); + }; + + /* Updates the primitive */ + __forceinline BBox3fa update(LineSegments* geom) + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const Vec3ff& p0 = geom->vertex(v0[i]+0); + const Vec3ff& p1 = geom->vertex(v0[i]+1); + BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); + b = enlarge(b,Vec3fa(max(p0.w,p1.w))); + bounds.extend(b); + } + return bounds; + } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) { + return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; + } + + public: + unsigned char gtype; + unsigned char m; + unsigned int sharedGeomID; + unsigned short leftExists, rightExists; + vuint<M> v0; // index of start vertex + private: + vuint<M> primIDs; // primitive ID + }; + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf4 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom) const + { + gather(p0,p1,geom); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom, + const int itime) const + { + gatheri(p0,p1,geom,itime); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf4 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); + transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); + transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); + transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); + transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1,aL,aR; + gatheri(a0,a1,aL,aR,geom,itime); + Vec4vf4 b0,b1,bL,bR; + gatheri(b0,b1,bL,bR,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + pL = lerp(aL,bL,vfloat4(ftime)); + pR = lerp(aR,bR,vfloat4(ftime)); + } + +#if defined(__AVX__) + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf8 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); + const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf); + const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf); + const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf); + const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf); + transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); + const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf); + const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf); + const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf); + const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf); + transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); + const vfloat4 l4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf); + const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf); + const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf); + const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf); + transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); + const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf); + const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf); + const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf); + const vfloat4 r7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf); + transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1,aL,aR; + gatheri(a0,a1,aL,aR,geom,itime); + Vec4vf8 b0,b1,bL,bR; + gatheri(b0,b1,bL,bR,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + pL = lerp(aL,bL,vfloat8(ftime)); + pR = lerp(aR,bR,vfloat8(ftime)); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom) const + { + gather(p0,p1,geom); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom, + const int itime) const + { + gatheri(p0,p1,geom,itime); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf8 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + +#endif + + template<int M> + typename LineMi<M>::Type LineMi<M>::type; + + typedef LineMi<4> Line4i; + typedef LineMi<8> Line8i; +} diff --git a/thirdparty/embree/kernels/geometry/linei_intersector.h b/thirdparty/embree/kernels/geometry/linei_intersector.h new file mode 100644 index 0000000000..5992827f5b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/linei_intersector.h @@ -0,0 +1,124 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linei.h" +#include "line_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct FlatLinearCurveMiIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, bool filter> + struct FlatLinearCurveMiMBIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, int K, bool filter> + struct FlatLinearCurveMiIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + + template<int M, int K, bool filter> + struct FlatLinearCurveMiMBIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1; line.gather(v0,v1,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + return FlatLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/object.h b/thirdparty/embree/kernels/geometry/object.h new file mode 100644 index 0000000000..2a61829ffd --- /dev/null +++ b/thirdparty/embree/kernels/geometry/object.h @@ -0,0 +1,84 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + struct Object + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return N; } + + public: + + /*! constructs a virtual object */ + Object (unsigned geomID, unsigned primID) + : _geomID(geomID), _primID(primID) {} + + __forceinline unsigned geomID() const { + return _geomID; + } + + __forceinline unsigned primID() const { + return _primID; + } + + /*! fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) + { + const PrimRef& prim = prims[i]; i++; + new (this) Object(prim.geomID(), prim.primID()); + } + + /*! fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) + { + const PrimRef& prim = prims[i]; i++; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + new (this) Object(geomID, primID); + AccelSet* accel = (AccelSet*) scene->get(geomID); + return accel->linearBounds(primID,itime); + } + + /*! fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) + { + const PrimRefMB& prim = prims[i]; i++; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + new (this) Object(geomID, primID); + AccelSet* accel = (AccelSet*) scene->get(geomID); + return accel->linearBounds(primID,time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(AccelSet* mesh) { + return mesh->bounds(primID()); + } + + private: + unsigned int _geomID; //!< geometry ID + unsigned int _primID; //!< primitive ID + }; +} diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h new file mode 100644 index 0000000000..11ceb2f7fe --- /dev/null +++ b/thirdparty/embree/kernels/geometry/object_intersector.h @@ -0,0 +1,127 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "object.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template<bool mblur> + struct ObjectIntersector1 + { + typedef Object Primitive; + + static const bool validIntersectorK = false; + + struct Precalculations { + __forceinline Precalculations() {} + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + if ((ray.mask & accel->mask) == 0) + return; +#endif + + accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + if ((ray.mask & accel->mask) == 0) + return false; +#endif + + accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + return ray.tfar < 0.0f; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID()); + context->geomID = prim.geomID(); + context->primID = prim.primID(); + return accel->pointQuery(query, context); + } + + template<int K> + static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(false); + } + + template<int K> + static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(false); + return valid; + } + }; + + template<int K, bool mblur> + struct ObjectIntersectorK + { + typedef Object Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {} + }; + + static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim) + { + vbool<K> valid = valid_i; + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + valid &= (ray.mask & accel->mask) != 0; + if (none(valid)) return; +#endif + accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1); + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim) + { + vbool<K> valid = valid_i; + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + valid &= (ray.mask & accel->mask) != 0; + if (none(valid)) return false; +#endif + accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + return ray.tfar < 0.0f; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool<K>(1<<int(k)),pre,ray,context,prim); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool<K>(1<<int(k)),pre,ray,context,prim); + return ray.tfar[k] < 0.0f; + } + }; + + typedef ObjectIntersectorK<4,false> ObjectIntersector4; + typedef ObjectIntersectorK<8,false> ObjectIntersector8; + typedef ObjectIntersectorK<16,false> ObjectIntersector16; + + typedef ObjectIntersectorK<4,true> ObjectIntersector4MB; + typedef ObjectIntersectorK<8,true> ObjectIntersector8MB; + typedef ObjectIntersectorK<16,true> ObjectIntersector16MB; + } +} diff --git a/thirdparty/embree/kernels/geometry/plane.h b/thirdparty/embree/kernels/geometry/plane.h new file mode 100644 index 0000000000..e447122eab --- /dev/null +++ b/thirdparty/embree/kernels/geometry/plane.h @@ -0,0 +1,57 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct HalfPlane + { + const Vec3fa P; //!< plane origin + const Vec3fa N; //!< plane normal + + __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N) + : P(P), N(N) {} + + __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const + { + Vec3fa O = Vec3fa(ray_org) - P; + Vec3fa D = Vec3fa(ray_dir); + float ON = dot(O,N); + float DN = dot(D,N); + bool eps = abs(DN) < min_rcp_input; + float t = -ON*rcp(DN); + float lower = select(eps || DN < 0.0f, float(neg_inf), t); + float upper = select(eps || DN > 0.0f, float(pos_inf), t); + return BBox1f(lower,upper); + } + }; + + template<int M> + struct HalfPlaneN + { + const Vec3vf<M> P; //!< plane origin + const Vec3vf<M> N; //!< plane normal + + __forceinline HalfPlaneN(const Vec3vf<M>& P, const Vec3vf<M>& N) + : P(P), N(N) {} + + __forceinline BBox<vfloat<M>> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const + { + Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray_org) - P; + Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray_dir); + vfloat<M> ON = dot(O,N); + vfloat<M> DN = dot(D,N); + vbool<M> eps = abs(DN) < min_rcp_input; + vfloat<M> t = -ON*rcp(DN); + vfloat<M> lower = select(eps | DN < 0.0f, vfloat<M>(neg_inf), t); + vfloat<M> upper = select(eps | DN > 0.0f, vfloat<M>(pos_inf), t); + return BBox<vfloat<M>>(lower,upper); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/pointi.h b/thirdparty/embree/kernels/geometry/pointi.h new file mode 100644 index 0000000000..bed04116b0 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/pointi.h @@ -0,0 +1,412 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + template<int M> + struct PointMi + { + /* Virtual interface to query information about the line segment type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored line segments */ + static __forceinline size_t max_size() + { + return M; + } + + /* Returns required number of primitive blocks for N line segments */ + static __forceinline size_t blocks(size_t N) + { + return (N + max_size() - 1) / max_size(); + } + + /* Returns required number of bytes for N line segments */ + static __forceinline size_t bytes(size_t N) + { + return blocks(N) * sizeof(PointMi); + } + + public: + /* Default constructor */ + __forceinline PointMi() {} + + /* Construction from vertices and IDs */ + __forceinline PointMi(const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype, uint32_t numPrimitives) + : gtype((unsigned char)gtype), + numPrimitives(numPrimitives), + sharedGeomID(geomIDs[0]), + primIDs(primIDs) + { + assert(all(vuint<M>(geomID()) == geomIDs)); + } + + /* Returns a mask that tells which line segments are valid */ + __forceinline vbool<M> valid() const { + return vint<M>(step) < vint<M>(numPrimitives); + } + + /* Returns if the specified line segment is valid */ + __forceinline bool valid(const size_t i) const + { + assert(i < M); + return i < numPrimitives; + } + + /* Returns the number of stored line segments */ + __forceinline size_t size() const { + return numPrimitives; + } + + __forceinline unsigned int geomID(unsigned int i = 0) const { + return sharedGeomID; + } + + __forceinline vuint<M>& primID() { + return primIDs; + } + __forceinline const vuint<M>& primID() const { + return primIDs; + } + __forceinline unsigned int primID(const size_t i) const { + assert(i < M); + return primIDs[i]; + } + + /* gather the line segments */ + __forceinline void gather(Vec4vf<M>& p0, const Points* geom) const; + __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom) const; + + __forceinline void gatheri(Vec4vf<M>& p0, const Points* geom, const int itime) const; + __forceinline void gatheri(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, const int itime) const; + + __forceinline void gather(Vec4vf<M>& p0, const Points* geom, float time) const; + __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, float time) const; + + /* Calculate the bounds of the line segments */ + __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const + { + BBox3fa bounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get<Points>(geomID(i)); + bounds.extend(geom->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { + return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1)); + } + + __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get<Points>(geomID(i)); + allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get<Points>(geomID((unsigned int)i)); + allBounds.extend(geom->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill line segment from line segment list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); + vuint<M> geomID, primID; + vuint<M> v0; + const PrimRefT* prim = &prims[begin]; + + int numPrimitives = 0; + for (size_t i = 0; i < M; i++) { + if (begin < end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); + begin++; + numPrimitives++; + } else { + assert(i); + if (i > 0) { + geomID[i] = geomID[i - 1]; + primID[i] = primID[i - 1]; + } + } + if (begin < end) + prim = &prims[begin]; // FIXME: remove this line + } + new (this) PointMi(geomID, primID, gty, numPrimitives); // FIXME: use non temporal store + } + + template<typename BVH, typename Allocator> + __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh, + const PrimRef* prims, + const range<size_t>& set, + const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = PointMi::blocks(set.size()); + size_t numbytes = PointMi::bytes(set.size()); + PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); + for (size_t i = 0; i < items; i++) { + accel[i].fill(prims, start, set.end(), bvh->scene); + } + return bvh->encodeLeaf((char*)accel, items); + }; + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB( + const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + template<typename BVH, typename SetMB, typename Allocator> + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.object_range.begin(); + size_t end = prims.object_range.end(); + size_t items = PointMi::blocks(prims.object_range.size()); + size_t numbytes = PointMi::bytes(prims.object_range.size()); + PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items); + + LBBox3fa bounds = empty; + for (size_t i = 0; i < items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range)); + + return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range); + }; + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& line) + { + return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; + } + + public: + unsigned char gtype; + unsigned char numPrimitives; + unsigned int sharedGeomID; + + private: + vuint<M> primIDs; // primitive ID + }; + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0))); + const vfloat4 b1 = vfloat4(geom->normal(primID(1))); + const vfloat4 b2 = vfloat4(geom->normal(primID(2))); + const vfloat4 b3 = vfloat4(geom->normal(primID(3))); + transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); + const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); + const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); + const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); + transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0; gatheri(a0, geom, itime); + Vec4vf4 b0; gatheri(b0, geom, itime + 1); + p0 = lerp(a0, b0, vfloat4(ftime)); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0, b0; + Vec3vf4 norm0, norm1; + gatheri(a0, norm0, geom, itime); + gatheri(b0, norm1, geom, itime + 1); + p0 = lerp(a0, b0, vfloat4(ftime)); + n0 = lerp(norm0, norm1, vfloat4(ftime)); + } + +#if defined(__AVX__) + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0))); + const vfloat4 b1 = vfloat4(geom->normal(primID(1))); + const vfloat4 b2 = vfloat4(geom->normal(primID(2))); + const vfloat4 b3 = vfloat4(geom->normal(primID(3))); + const vfloat4 b4 = vfloat4(geom->normal(primID(4))); + const vfloat4 b5 = vfloat4(geom->normal(primID(5))); + const vfloat4 b6 = vfloat4(geom->normal(primID(6))); + const vfloat4 b7 = vfloat4(geom->normal(primID(7))); + transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); + const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); + const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); + const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); + const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime)); + const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime)); + const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime)); + const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime)); + transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0; + gatheri(a0, geom, itime); + Vec4vf8 b0; + gatheri(b0, geom, itime + 1); + p0 = lerp(a0, b0, vfloat8(ftime)); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0, b0; + Vec3vf8 norm0, norm1; + gatheri(a0, norm0, geom, itime); + gatheri(b0, norm1, geom, itime + 1); + p0 = lerp(a0, b0, vfloat8(ftime)); + n0 = lerp(norm0, norm1, vfloat8(ftime)); + } +#endif + + template<int M> + typename PointMi<M>::Type PointMi<M>::type; + + typedef PointMi<4> Point4i; + typedef PointMi<8> Point8i; + +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/primitive.h b/thirdparty/embree/kernels/geometry/primitive.h new file mode 100644 index 0000000000..608d981dd7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/primitive.h @@ -0,0 +1,49 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene.h" +#include "../../common/simd/simd.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + struct PrimitiveType + { + /*! returns name of this primitive type */ + virtual const char* name() const = 0; + + /*! Returns the number of stored active primitives in a block. */ + virtual size_t sizeActive(const char* This) const = 0; + + /*! Returns the number of stored active and inactive primitives in a block. */ + virtual size_t sizeTotal(const char* This) const = 0; + + /*! Returns the number of bytes of block. */ + virtual size_t getBytes(const char* This) const = 0; + }; + + template<typename Primitive> + struct PrimitivePointQuery1 + { + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) + { + bool changed = false; + for (size_t i = 0; i < Primitive::max_size(); i++) + { + if (!prim.valid(i)) break; + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i)); + context->geomID = prim.geomID(i); + context->primID = prim.primID(i); + changed |= accel->pointQuery(query, context); + } + return changed; + } + + static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { } + }; +} diff --git a/thirdparty/embree/kernels/geometry/primitive4.cpp b/thirdparty/embree/kernels/geometry/primitive4.cpp new file mode 100644 index 0000000000..9c953c5d35 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/primitive4.cpp @@ -0,0 +1,379 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primitive.h" +#include "curveNv.h" +#include "curveNi.h" +#include "curveNi_mb.h" +#include "linei.h" +#include "triangle.h" +#include "trianglev.h" +#include "trianglev_mb.h" +#include "trianglei.h" +#include "quadv.h" +#include "quadi.h" +#include "subdivpatch1.h" +#include "object.h" +#include "instance.h" +#include "subgrid.h" + +namespace embree +{ + /********************** Curve4v **************************/ + + template<> + const char* Curve4v::Type::name () const { + return "curve4v"; + } + + template<> + size_t Curve4v::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4v*)This)->N; + } + + template<> + size_t Curve4v::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4v*)This)->N; + } + + template<> + size_t Curve4v::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4v::bytes(sizeActive(This)); + } + + /********************** Curve4i **************************/ + + template<> + const char* Curve4i::Type::name () const { + return "curve4i"; + } + + template<> + size_t Curve4i::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4i*)This)->N; + } + + template<> + size_t Curve4i::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4i*)This)->N; + } + + template<> + size_t Curve4i::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4i::bytes(sizeActive(This)); + } + + /********************** Curve4iMB **************************/ + + template<> + const char* Curve4iMB::Type::name () const { + return "curve4imb"; + } + + template<> + size_t Curve4iMB::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4iMB*)This)->N; + } + + template<> + size_t Curve4iMB::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4iMB*)This)->N; + } + + template<> + size_t Curve4iMB::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4iMB::bytes(sizeActive(This)); + } + + /********************** Line4i **************************/ + + template<> + const char* Line4i::Type::name () const { + return "line4i"; + } + + template<> + size_t Line4i::Type::sizeActive(const char* This) const { + return ((Line4i*)This)->size(); + } + + template<> + size_t Line4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Line4i::Type::getBytes(const char* This) const { + return sizeof(Line4i); + } + + /********************** Triangle4 **************************/ + + template<> + const char* Triangle4::Type::name () const { + return "triangle4"; + } + + template<> + size_t Triangle4::Type::sizeActive(const char* This) const { + return ((Triangle4*)This)->size(); + } + + template<> + size_t Triangle4::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4::Type::getBytes(const char* This) const { + return sizeof(Triangle4); + } + + /********************** Triangle4v **************************/ + + template<> + const char* Triangle4v::Type::name () const { + return "triangle4v"; + } + + template<> + size_t Triangle4v::Type::sizeActive(const char* This) const { + return ((Triangle4v*)This)->size(); + } + + template<> + size_t Triangle4v::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4v::Type::getBytes(const char* This) const { + return sizeof(Triangle4v); + } + + /********************** Triangle4i **************************/ + + template<> + const char* Triangle4i::Type::name () const { + return "triangle4i"; + } + + template<> + size_t Triangle4i::Type::sizeActive(const char* This) const { + return ((Triangle4i*)This)->size(); + } + + template<> + size_t Triangle4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4i::Type::getBytes(const char* This) const { + return sizeof(Triangle4i); + } + + /********************** Triangle4vMB **************************/ + + template<> + const char* Triangle4vMB::Type::name () const { + return "triangle4vmb"; + } + + template<> + size_t Triangle4vMB::Type::sizeActive(const char* This) const { + return ((Triangle4vMB*)This)->size(); + } + + template<> + size_t Triangle4vMB::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4vMB::Type::getBytes(const char* This) const { + return sizeof(Triangle4vMB); + } + + /********************** Quad4v **************************/ + + template<> + const char* Quad4v::Type::name () const { + return "quad4v"; + } + + template<> + size_t Quad4v::Type::sizeActive(const char* This) const { + return ((Quad4v*)This)->size(); + } + + template<> + size_t Quad4v::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Quad4v::Type::getBytes(const char* This) const { + return sizeof(Quad4v); + } + + /********************** Quad4i **************************/ + + template<> + const char* Quad4i::Type::name () const { + return "quad4i"; + } + + template<> + size_t Quad4i::Type::sizeActive(const char* This) const { + return ((Quad4i*)This)->size(); + } + + template<> + size_t Quad4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Quad4i::Type::getBytes(const char* This) const { + return sizeof(Quad4i); + } + + /********************** SubdivPatch1 **************************/ + + const char* SubdivPatch1::Type::name () const { + return "subdivpatch1"; + } + + size_t SubdivPatch1::Type::sizeActive(const char* This) const { + return 1; + } + + size_t SubdivPatch1::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t SubdivPatch1::Type::getBytes(const char* This) const { + return sizeof(SubdivPatch1); + } + + SubdivPatch1::Type SubdivPatch1::type; + + /********************** Virtual Object **************************/ + + const char* Object::Type::name () const { + return "object"; + } + + size_t Object::Type::sizeActive(const char* This) const { + return 1; + } + + size_t Object::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t Object::Type::getBytes(const char* This) const { + return sizeof(Object); + } + + Object::Type Object::type; + + /********************** Instance **************************/ + + const char* InstancePrimitive::Type::name () const { + return "instance"; + } + + size_t InstancePrimitive::Type::sizeActive(const char* This) const { + return 1; + } + + size_t InstancePrimitive::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t InstancePrimitive::Type::getBytes(const char* This) const { + return sizeof(InstancePrimitive); + } + + InstancePrimitive::Type InstancePrimitive::type; + + /********************** SubGrid **************************/ + + const char* SubGrid::Type::name () const { + return "subgrid"; + } + + size_t SubGrid::Type::sizeActive(const char* This) const { + return 1; + } + + size_t SubGrid::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t SubGrid::Type::getBytes(const char* This) const { + return sizeof(SubGrid); + } + + SubGrid::Type SubGrid::type; + + /********************** SubGridQBVH4 **************************/ + + template<> + const char* SubGridQBVH4::Type::name () const { + return "SubGridQBVH4"; + } + + template<> + size_t SubGridQBVH4::Type::sizeActive(const char* This) const { + return 1; + } + + template<> + size_t SubGridQBVH4::Type::sizeTotal(const char* This) const { + return 1; + } + + template<> + size_t SubGridQBVH4::Type::getBytes(const char* This) const { + return sizeof(SubGridQBVH4); + } +} diff --git a/thirdparty/embree/kernels/geometry/quad_intersector.h b/thirdparty/embree/kernels/geometry/quad_intersector.h new file mode 100644 index 0000000000..93c9526912 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quad_intersector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + namespace isa + { + /*! Intersects a ray with a quad with backface culling + * enabled. The quad v0,v1,v2,v3 is split into two triangles + * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two + * triangles gets intersected. */ + template<int N> + __forceinline vbool<N> intersect_quad_backface_culling(const vbool<N>& valid0, + const Vec3fa& ray_org, + const Vec3fa& ray_dir, + const float ray_tnear, + const float ray_tfar, + const Vec3vf<N>& quad_v0, + const Vec3vf<N>& quad_v1, + const Vec3vf<N>& quad_v2, + const Vec3vf<N>& quad_v3, + vfloat<N>& u_o, + vfloat<N>& v_o, + vfloat<N>& t_o) + { + /* calculate vertices relative to ray origin */ + vbool<N> valid = valid0; + const Vec3vf<N> O = Vec3vf<N>(ray_org); + const Vec3vf<N> D = Vec3vf<N>(ray_dir); + const Vec3vf<N> va = quad_v0-O; + const Vec3vf<N> vb = quad_v1-O; + const Vec3vf<N> vc = quad_v2-O; + const Vec3vf<N> vd = quad_v3-O; + + const Vec3vf<N> edb = vb-vd; + const vfloat<N> WW = dot(cross(vd,edb),D); + const Vec3vf<N> v0 = select(WW <= 0.0f,va,vc); + const Vec3vf<N> v1 = select(WW <= 0.0f,vb,vd); + const Vec3vf<N> v2 = select(WW <= 0.0f,vd,vb); + + /* calculate edges */ + const Vec3vf<N> e0 = v2-v0; + const Vec3vf<N> e1 = v0-v1; + + /* perform edge tests */ + const vfloat<N> U = dot(cross(v0,e0),D); + const vfloat<N> V = dot(cross(v1,e1),D); + valid &= max(U,V) <= 0.0f; + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<N> Ng = cross(e1,e0); + const vfloat<N> den = dot(Ng,D); + const vfloat<N> rcpDen = rcp(den); + + /* perform depth test */ + const vfloat<N> t = rcpDen*dot(v0,Ng); + valid &= vfloat<N>(ray_tnear) <= t & t <= vfloat<N>(ray_tfar); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat<N>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + t_o = t; + u_o = U * rcpDen; + v_o = V * rcpDen; + u_o = select(WW <= 0.0f,u_o,1.0f-u_o); + v_o = select(WW <= 0.0f,v_o,1.0f-v_o); + return valid; + } + } +} diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h new file mode 100644 index 0000000000..3abc9d6f70 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quad_intersector_moeller.h @@ -0,0 +1,460 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadv.h" +#include "triangle_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct QuadHitM + { + __forceinline QuadHitM() {} + + __forceinline QuadHitM(const vbool<M>& valid, + const vfloat<M>& U, + const vfloat<M>& V, + const vfloat<M>& T, + const vfloat<M>& absDen, + const Vec3vf<M>& Ng, + const vbool<M>& flags) + : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {} + + __forceinline void finalize() + { + const vfloat<M> rcpAbsDen = rcp(absDen); + vt = T * rcpAbsDen; + const vfloat<M> u = min(U * rcpAbsDen,1.0f); + const vfloat<M> v = min(V * rcpAbsDen,1.0f); + const vfloat<M> u1 = vfloat<M>(1.0f) - u; + const vfloat<M> v1 = vfloat<M>(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z); +#else + const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); +#endif + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + vfloat<M> absDen; + Vec3vf<M> tri_Ng; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + + public: + const vbool<M> flags; + }; + + template<int K> + struct QuadHitK + { + __forceinline QuadHitK(const vfloat<K>& U, + const vfloat<K>& V, + const vfloat<K>& T, + const vfloat<K>& absDen, + const Vec3vf<K>& Ng, + const vbool<K>& flags) + : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vfloat<K> rcpAbsDen = rcp(absDen); + const vfloat<K> t = T * rcpAbsDen; + const vfloat<K> u0 = min(U * rcpAbsDen,1.0f); + const vfloat<K> v0 = min(V * rcpAbsDen,1.0f); + const vfloat<K> u1 = vfloat<K>(1.0f) - u0; + const vfloat<K> v1 = vfloat<K>(1.0f) - v0; + const vfloat<K> u = select(flags,u1,u0); + const vfloat<K> v = select(flags,v1,v0); + const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat<K> U; + const vfloat<K> V; + const vfloat<K> T; + const vfloat<K> absDen; + const vbool<K> flags; + const Vec3vf<K> tri_Ng; + }; + + /* ----------------------------- */ + /* -- single ray intersectors -- */ + /* ----------------------------- */ + + + template<int M, bool filter> + struct QuadMIntersector1MoellerTrumbore; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMIntersector1MoellerTrumbore + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Intersect1EpilogM<M,filter> epilog(ray,context,geomID,primID); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + epilog(hit.valid,hit); + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Occluded1EpilogM<M,filter> epilog(ray,context,geomID,primID); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct QuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + UVIdentity<8> mapUV; + MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV); + MoellerTrumboreIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit))) + { + vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; + +#if !defined(EMBREE_BACKFACE_CULLING) + hit.U = select(flags,absDen-V,U); + hit.V = select(flags,absDen-U,V); + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR +#else + hit.U = select(flags,absDen-U,U); + hit.V = select(flags,absDen-V,V); +#endif + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + + struct MoellerTrumboreIntersector1KTriangleM + { + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + template<int M, int K, typename Epilog> + static __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Vec3vf<M>& tri_Ng, + const vbool<M>& flags, + const Epilog& epilog) + { + /* calculate denominator */ + const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O; + const Vec3vf<M> R = cross(C,D); + const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D); + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen; + const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + QuadHitM<M> hit(valid,U,V,T,absDen,tri_Ng,flags); + return epilog(valid,hit); + } + + template<int M, int K, typename Epilog> + static __forceinline bool intersect1(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const vbool<M>& flags, + const Epilog& epilog) + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + const Vec3vf<M> Ng = cross(e2,e1); + return intersect<M,K>(ray,k,v0,e1,e2,Ng,flags,epilog); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKMoellerTrumboreBase + { + __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Vec3vf<K>& tri_Ng, + const vbool<K>& flags, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool<K> valid = valid0; + const Vec3vf<K> C = tri_v0 - ray.org; + const Vec3vf<K> R = cross(C,ray.dir); + const vfloat<K> den = dot(tri_Ng,ray.dir); + const vfloat<K> absDen = abs(den); + const vfloat<K> sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat<K> U = dot(R,tri_e2) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat<K> V = dot(R,tri_e1) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat<K> W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + QuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M quads. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const vbool<K>& flags, + const Epilog& epilog) const + { + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog); + } + + /*! Intersects K rays with one of M quads. */ + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog); + return none(valid0); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase<M,K,filter> + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Intersect1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog); + MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Occluded1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + if (MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true; + if (MoellerTrumboreIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true; + return false; + } + }; + + +#if defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + return MoellerTrumboreIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + } +} diff --git a/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h new file mode 100644 index 0000000000..9873ff76ac --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quad_intersector_pluecker.h @@ -0,0 +1,438 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quad_intersector_moeller.h" + +/*! Modified Pluecker ray/triangle intersector. The test first shifts + * the ray origin into the origin of the coordinate system and then + * uses Pluecker coordinates for the intersection. Due to the shift, + * the Pluecker coordinate calculation simplifies and the tests get + * numerically stable. The edge equations are watertight along the + * edge for neighboring triangles. */ + +namespace embree +{ + namespace isa + { + template<int M> + struct QuadHitPlueckerM + { + __forceinline QuadHitPlueckerM() {} + + __forceinline QuadHitPlueckerM(const vbool<M>& valid, + const vfloat<M>& U, + const vfloat<M>& V, + const vfloat<M>& UVW, + const vfloat<M>& t, + const Vec3vf<M>& Ng, + const vbool<M>& flags) + : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {} + + __forceinline void finalize() + { + const vbool<M> invalid = abs(UVW) < min_rcp_input; + const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); + const vfloat<M> u = min(U * rcpUVW,1.0f); + const vfloat<M> v = min(V * rcpUVW,1.0f); + const vfloat<M> u1 = vfloat<M>(1.0f) - u; + const vfloat<M> v1 = vfloat<M>(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z); +#else + const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); +#endif + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + vfloat<M> U; + vfloat<M> V; + vfloat<M> UVW; + Vec3vf<M> tri_Ng; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + + public: + const vbool<M> flags; + }; + + template<int K> + struct QuadHitPlueckerK + { + __forceinline QuadHitPlueckerK(const vfloat<K>& U, + const vfloat<K>& V, + const vfloat<K>& UVW, + const vfloat<K>& t, + const Vec3vf<K>& Ng, + const vbool<K>& flags) + : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vbool<K> invalid = abs(UVW) < min_rcp_input; + const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); + const vfloat<K> u0 = min(U * rcpUVW,1.0f); + const vfloat<K> v0 = min(V * rcpUVW,1.0f); + const vfloat<K> u1 = vfloat<K>(1.0f) - u0; + const vfloat<K> v1 = vfloat<K>(1.0f) - v0; + const vfloat<K> u = select(flags,u1,u0); + const vfloat<K> v = select(flags,v1,v0); + const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat<K> U; + const vfloat<K> V; + const vfloat<K> UVW; + const vfloat<K> t; + const vbool<K> flags; + const Vec3vf<K> tri_Ng; + }; + + struct PlueckerIntersectorTriangle1 + { + template<int M, typename Epilog> + static __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const vbool<M>& flags, + const Epilog& epilog) + { + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org); + const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = max(U,V,W) <= eps; +#else + vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); + valid &= den != vfloat<M>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + }; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMIntersector1Pluecker + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Intersect1EpilogM<M,filter> epilog(ray,context,geomID,primID); + PlueckerIntersectorTriangle1::intersect<M>(ray,v0,v1,v3,vbool<M>(false),epilog); + PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true),epilog); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Occluded1EpilogM<M,filter> epilog(ray,context,geomID,primID); + if (PlueckerIntersectorTriangle1::intersect<M>(ray,v0,v1,v3,vbool<M>(false),epilog)) return true; + if (PlueckerIntersectorTriangle1::intersect<M>(ray,v2,v3,v1,vbool<M>(true ),epilog)) return true; + return false; + } + }; + +#if defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct QuadMIntersector1Pluecker<4,filter> + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + return PlueckerIntersectorTriangle1::intersect<8>(ray,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + struct PlueckerIntersector1KTriangleM + { + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + template<int M, int K, typename Epilog> + static __forceinline bool intersect1(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const vbool<M>& flags, + const Epilog& epilog) + { + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = max(U,V,W) <= eps; +#else + vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat<M>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKPlueckerBase + { + __forceinline QuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const vbool<K>& flags, + const Epilog& epilog) const + { + /* calculate vertices relative to ray origin */ + vbool<K> valid = valid0; + const Vec3vf<K> O = ray.org; + const Vec3vf<K> D = ray.dir; + const Vec3vf<K> v0 = tri_v0-O; + const Vec3vf<K> v1 = tri_v1-O; + const Vec3vf<K> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<K> e0 = v2-v0; + const Vec3vf<K> e1 = v0-v1; + const Vec3vf<K> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D); + const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D); + const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D); + const vfloat<K> UVW = U+V+W; + const vfloat<K> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D)); + + /* perform depth test */ + const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng))); + const vfloat<K> t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; + + /* calculate hit information */ + QuadHitPlueckerK<K> hit(U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M quads. */ + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog); + return none(valid0); + } + }; + + template<int M, int K, bool filter> + struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase<M,K,filter> + { + __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Intersect1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog); + PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const vuint<M>& geomID, const vuint<M>& primID) const + { + Occluded1KEpilogM<M,K,filter> epilog(ray,k,context,geomID,primID); + if (PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true; + if (PlueckerIntersector1KTriangleM::intersect1<M,K>(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true; + return false; + } + }; + +#if defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter> + { + __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); + const vbool8 flags(0,0,0,0,1,1,1,1); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + return PlueckerIntersector1KTriangleM::intersect1<8,K>(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + } +} diff --git a/thirdparty/embree/kernels/geometry/quadi.h b/thirdparty/embree/kernels/geometry/quadi.h new file mode 100644 index 0000000000..70a7bdf158 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadi.h @@ -0,0 +1,483 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene.h" + +namespace embree +{ + /* Stores M quads from an indexed face set */ + template <int M> + struct QuadMi + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline QuadMi() { } + + /* Construction from vertices and IDs */ + __forceinline QuadMi(const vuint<M>& v0, + const vuint<M>& v1, + const vuint<M>& v2, + const vuint<M>& v3, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) +#if defined(EMBREE_COMPACT_POLYS) + : geomIDs(geomIDs), primIDs(primIDs) {} +#else + : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {} +#endif + + /* Returns a mask that tells which quads are valid */ + __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); } + + /* Returns if the specified quad is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; } + + /* Returns the number of stored quads */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); assert(geomIDs[i] != -1); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the quads */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i)); + bounds.extend(mesh->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill quad from quad list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> geomID = -1, primID = -1; + const PrimRefT* prim = &prims[begin]; + vuint<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero; + + for (size_t i=0; i<M; i++) + { + if (begin<end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); +#if !defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get<QuadMesh>(prim->geomID()); + const QuadMesh::Quad& q = mesh->quad(prim->primID()); + unsigned int_stride = mesh->vertices0.getStride()/4; + v0[i] = q.v[0] * int_stride; + v1[i] = q.v[1] * int_stride; + v2[i] = q.v[2] * int_stride; + v3[i] = q.v[3] * int_stride; +#endif + begin++; + } else { + assert(i); + if (likely(i > 0)) { + geomID[i] = geomID[0]; // always valid geomIDs + primID[i] = -1; // indicates invalid data + v0[i] = v0[0]; + v1[i] = v0[0]; + v2[i] = v0[0]; + v3[i] = v0[0]; + } + } + if (begin<end) prim = &prims[begin]; + } + new (this) QuadMi(v0,v1,v2,v3,geomID,primID); // FIXME: use non temporal store + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + friend embree_ostream operator<<(embree_ostream cout, const QuadMi& quad) { + return cout << "QuadMi<" << M << ">( " +#if !defined(EMBREE_COMPACT_POLYS) + << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", " +#endif + << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )"; + } + + protected: +#if !defined(EMBREE_COMPACT_POLYS) + vuint<M> v0_; // 4 byte offset of 1st vertex + vuint<M> v1_; // 4 byte offset of 2nd vertex + vuint<M> v2_; // 4 byte offset of 3rd vertex + vuint<M> v3_; // 4 byte offset of 4th vertex +#endif + vuint<M> geomIDs; // geometry ID of mesh + vuint<M> primIDs; // primitive ID of primitive inside mesh + }; + + namespace isa + { + + template<int M> + struct QuadMi : public embree::QuadMi<M> + { +#if !defined(EMBREE_COMPACT_POLYS) + using embree::QuadMi<M>::v0_; + using embree::QuadMi<M>::v1_; + using embree::QuadMi<M>::v2_; + using embree::QuadMi<M>::v3_; +#endif + using embree::QuadMi<M>::geomIDs; + using embree::QuadMi<M>::primIDs; + using embree::QuadMi<M>::geomID; + using embree::QuadMi<M>::primID; + using embree::QuadMi<M>::valid; + + template<int vid> + __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + return (Vec3f) mesh->vertices[0][quad.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices = scene->vertices[geomID(index)]; + return (Vec3f&) vertices[v[index]]; +#endif + } + + template<int vid, typename T> + __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + const Vec3<T> p0(v0.x,v0.y,v0.z); + const Vec3<T> p1(v1.x,v1.y,v1.z); + return lerp(p0,p1,ftime); + } + + template<int vid, int K, typename T> + __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const + { + Vec3<T> p0, p1; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + + for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; + p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; + } + return (T(one)-ftime)*p0 + ftime*p1; + } + + struct Quad { + vfloat4 v0,v1,v2,v3; + }; + +#if defined(EMBREE_COMPACT_POLYS) + + __forceinline Quad loadQuad(const int i, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero, zero }; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]]; + const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]]; + return { v0, v1, v2, v3 }; + } + + __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero, zero }; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]]; + const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]]; + return { v0, v1, v2, v3 }; + } + +#else + + __forceinline Quad loadQuad(const int i, const Scene* const scene) const + { + const float* vertices = scene->vertices[geomID(i)]; + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); + return { v0, v1, v2, v3 }; + } + + __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const QuadMesh* mesh = scene->get<QuadMesh>(geomID); + const float* vertices = (const float*) mesh->vertexPtr(0,itime); + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); + return { v0, v1, v2, v3 }; + } + +#endif + + /* Gather the quads */ + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + Vec3vf<M>& p3, + const Scene *const scene) const; + +#if defined(__AVX512F__) + __forceinline void gather(Vec3vf16& p0, + Vec3vf16& p1, + Vec3vf16& p2, + Vec3vf16& p3, + const Scene *const scene) const; +#endif + + template<int K> +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 + __noinline +#else + __forceinline +#endif + void gather(const vbool<K>& valid, + Vec3vf<K>& p0, + Vec3vf<K>& p1, + Vec3vf<K>& p2, + Vec3vf<K>& p3, + const size_t index, + const Scene* const scene, + const vfloat<K>& time) const + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index)); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(time, ftime); + + const size_t first = bsf(movemask(valid)); + if (likely(all(valid,itime[first] == itime))) + { + p0 = getVertex<0>(index, scene, itime[first], ftime); + p1 = getVertex<1>(index, scene, itime[first], ftime); + p2 = getVertex<2>(index, scene, itime[first], ftime); + p3 = getVertex<3>(index, scene, itime[first], ftime); + } + else + { + p0 = getVertex<0,K>(valid, index, scene, itime, ftime); + p1 = getVertex<1,K>(valid, index, scene, itime, ftime); + p2 = getVertex<2,K>(valid, index, scene, itime, ftime); + p3 = getVertex<3,K>(valid, index, scene, itime, ftime); + } + } + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + Vec3vf<M>& p3, + const QuadMesh* mesh, + const Scene *const scene, + const int itime) const; + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + Vec3vf<M>& p3, + const Scene *const scene, + const float time) const; + + /* Updates the primitive */ + __forceinline BBox3fa update(QuadMesh* mesh) + { + BBox3fa bounds = empty; + for (size_t i=0; i<M; i++) + { + if (!valid(i)) break; + const unsigned primId = primID(i); + const QuadMesh::Quad& q = mesh->quad(primId); + const Vec3fa p0 = mesh->vertex(q.v[0]); + const Vec3fa p1 = mesh->vertex(q.v[1]); + const Vec3fa p2 = mesh->vertex(q.v[2]); + const Vec3fa p3 = mesh->vertex(q.v[3]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); + } + return bounds; + } + + private: +#if !defined(EMBREE_COMPACT_POLYS) + template<int N> const vuint<M>& getVertexOffset() const; +#endif + }; + +#if !defined(EMBREE_COMPACT_POLYS) + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; } +#endif + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene) const + { + prefetchL1(((char*)this)+0*64); + prefetchL1(((char*)this)+1*64); + const Quad tri0 = loadQuad(0,scene); + const Quad tri1 = loadQuad(1,scene); + const Quad tri2 = loadQuad(2,scene); + const Quad tri3 = loadQuad(3,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); + } + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const QuadMesh* mesh, + const Scene *const scene, + const int itime) const + { + // FIXME: for trianglei there all geometries are identical, is this the case here too? + + const Quad tri0 = loadQuad(0,itime,scene); + const Quad tri1 = loadQuad(1,itime,scene); + const Quad tri2 = loadQuad(2,itime,scene); + const Quad tri3 = loadQuad(3,itime,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); + } + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene, + const float time) const + { + const QuadMesh* mesh = scene->get<QuadMesh>(geomID(0)); // in mblur mode all geometries are identical + + float ftime; + const int itime = mesh->timeSegment(time, ftime); + + Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime); + Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + p2 = lerp(a2,b2,vfloat4(ftime)); + p3 = lerp(a3,b3,vfloat4(ftime)); + } + } + + template<int M> + typename QuadMi<M>::Type QuadMi<M>::type; + + typedef QuadMi<4> Quad4i; +} diff --git a/thirdparty/embree/kernels/geometry/quadi_intersector.h b/thirdparty/embree/kernels/geometry/quadi_intersector.h new file mode 100644 index 0000000000..20a98c3406 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadi_intersector.h @@ -0,0 +1,350 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadi.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMiIntersector1Moeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMiIntersectorKMoeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMiIntersector1Pluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1Pluecker<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMiIntersectorKPluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + Scene* scene = context->scene; + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene); + const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene); + const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene); + const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M motion blur quads with 1 ray */ + template<int M, bool filter> + struct QuadMiMBIntersector1Moeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M motion blur quads with K rays. */ + template<int M, int K, bool filter> + struct QuadMiMBIntersectorKMoeller + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations; + + /*! Intersects K rays with M quads. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M quads. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M quads and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M quads. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M motion blur quads with 1 ray */ + template<int M, bool filter> + struct QuadMiMBIntersector1Pluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersector1Pluecker<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M motion blur quads with K rays. */ + template<int M, int K, bool filter> + struct QuadMiMBIntersectorKPluecker + { + typedef QuadMi<M> Primitive; + typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations; + + /*! Intersects K rays with M quads. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M quads. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<QuadMi<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2,v3; quad.template gather<K>(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M quads and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M quads. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h new file mode 100644 index 0000000000..2137356ff2 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadv.h @@ -0,0 +1,165 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M quads in struct of array layout */ + template <int M> + struct QuadMv + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline QuadMv() {} + + /* Construction from vertices and IDs */ + __forceinline QuadMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which quads are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns true if the specified quad is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored quads */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M> primID() { return primIDs; } + __forceinline const vuint<M> primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the quads */ + __forceinline BBox3fa bounds() const + { + Vec3vf<M> lower = min(v0,v1,v2,v3); + Vec3vf<M> upper = max(v0,v1,v2,v3); + vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(QuadMv* dst, const QuadMv& src) + { + vfloat<M>::store_nt(&dst->v0.x,src.v0.x); + vfloat<M>::store_nt(&dst->v0.y,src.v0.y); + vfloat<M>::store_nt(&dst->v0.z,src.v0.z); + vfloat<M>::store_nt(&dst->v1.x,src.v1.x); + vfloat<M>::store_nt(&dst->v1.y,src.v1.y); + vfloat<M>::store_nt(&dst->v1.z,src.v1.z); + vfloat<M>::store_nt(&dst->v2.x,src.v2.x); + vfloat<M>::store_nt(&dst->v2.y,src.v2.y); + vfloat<M>::store_nt(&dst->v2.z,src.v2.z); + vfloat<M>::store_nt(&dst->v3.x,src.v3.x); + vfloat<M>::store_nt(&dst->v3.y,src.v3.y); + vfloat<M>::store_nt(&dst->v3.z,src.v3.z); + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill quad from quad list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const QuadMesh* __restrict__ const mesh = scene->get<QuadMesh>(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const Vec3fa& p0 = mesh->vertex(quad.v[0]); + const Vec3fa& p1 = mesh->vertex(quad.v[1]); + const Vec3fa& p2 = mesh->vertex(quad.v[2]); + const Vec3fa& p3 = mesh->vertex(quad.v[3]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(QuadMesh* mesh) + { + BBox3fa bounds = empty; + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M; i++) + { + if (primID(i) == -1) break; + const unsigned geomId = geomID(i); + const unsigned primId = primID(i); + const QuadMesh::Quad& quad = mesh->quad(primId); + const Vec3fa p0 = mesh->vertex(quad.v[0]); + const Vec3fa p1 = mesh->vertex(quad.v[1]); + const Vec3fa p2 = mesh->vertex(quad.v[2]); + const Vec3fa p3 = mesh->vertex(quad.v[3]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID); + return bounds; + } + + public: + Vec3vf<M> v0; // 1st vertex of the quads + Vec3vf<M> v1; // 2nd vertex of the quads + Vec3vf<M> v2; // 3rd vertex of the quads + Vec3vf<M> v3; // 4rd vertex of the quads + private: + vuint<M> geomIDs; // geometry ID + vuint<M> primIDs; // primitive ID + }; + + template<int M> + typename QuadMv<M>::Type QuadMv<M>::type; + + typedef QuadMv<4> Quad4v; +} diff --git a/thirdparty/embree/kernels/geometry/quadv_intersector.h b/thirdparty/embree/kernels/geometry/quadv_intersector.h new file mode 100644 index 0000000000..9b28e05614 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/quadv_intersector.h @@ -0,0 +1,181 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadv.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMvIntersector1Moeller + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMvIntersectorKMoeller + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M quads with 1 ray */ + template<int M, bool filter> + struct QuadMvIntersector1Pluecker + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersector1Pluecker<M,filter> Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct QuadMvIntersectorKPluecker + { + typedef QuadMv<M> Primitive; + typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<QuadMv<M>::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); + const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i); + const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i); + const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + }; + } +} + diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h new file mode 100644 index 0000000000..0e9393442b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h @@ -0,0 +1,715 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + + +/* + + This file implements the intersection of a ray with a round linear + curve segment. We define the geometry of such a round linear curve + segment from point p0 with radius r0 to point p1 with radius r1 + using the cone that touches spheres p0/r0 and p1/r1 tangentially + plus the sphere p1/r1. We denote the tangentially touching cone from + p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending + sphere with cone_sphere(p0,r0,p1,r1). + + For multiple connected round linear curve segments this construction + yield a proper shape when viewed from the outside. Using the + following CSG we can also handle the interiour in most common cases: + + round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) = + cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr) + + Thus by subtracting the neighboring cone geometries, we cut away + parts of the center cone_sphere surface which lie inside the + combined curve. This approach works as long as geometry of the + current cone_sphere penetrates into direct neighbor segments only, + and not into segments further away. + + To construct a cone that touches two spheres at p0 and p1 with r0 + and r1, one has to increase the cone radius at r0 and r1 to obtain + larger radii w0 and w1, such that the infinite cone properly touches + the spheres. From the paper "Ray Tracing Generalized Tube + Primitives: Method and Applications" + (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications) + one can derive the following equations for these increased + radii: + + sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0)) + w0 = sr*r0 + w1 = sr*r1 + + Further, we want the cone to start where it touches the sphere at p0 + and to end where it touches sphere at p1. Therefore, we need to + construct clipping locations y0 and y1 for the start and end of the + cone. These start and end clipping location of the cone can get + calculated as: + + Y0 = - r0 * (r1-r0) / length(p1-p0) + Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) + + Where the cone starts a distance Y0 and ends a distance Y1 away of + point p0 along the cone center. The distance between Y1-Y0 can get + calculated as: + + dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0) + + In the code below, Y will always be scaled by length(p1-p0) to + obtain y and you will find the terms r0*(r1-r0) and + (p1-p0)^2-(r1-r0)^2. + + */ + +namespace embree +{ + namespace isa + { + template<int M> + struct RoundLineIntersectorHitM + { + __forceinline RoundLineIntersectorHitM() {} + + __forceinline RoundLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + public: + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + namespace __roundline_internal + { + template<int M> + struct ConeGeometry + { + ConeGeometry (const Vec4vf<M>& a, const Vec4vf<M>& b) + : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {} + + /* + + This function tests if a point is accepted by first cone + clipping plane. + + First, we need to project the point onto the line p0->p1: + + Y = (p-p0)*(p1-p0)/length(p1-p0) + + This value y is the distance to the projection point from + p0. The clip distances are calculated as: + + Y0 = - r0 * (r1-r0) / length(p1-p0) + Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) + + Thus to test if the point p is accepted by the first + clipping plane we need to test Y > Y0 and to test if it + is accepted by the second clipping plane we need to test + Y < Y1. + + By multiplying the calculations with length(p1-p0) these + calculation can get simplied to: + + y = (p-p0)*(p1-p0) + y0 = - r0 * (r1-r0) + y1 = (p1-p0)^2 - r1 * (r1-r0) + + and the test y > y0 and y < y1. + + */ + + __forceinline vbool<M> isClippedByPlane (const vbool<M>& valid_i, const Vec3vf<M>& p) const + { + const Vec3vf<M> p0p = p - p0; + const vfloat<M> y = dot(p0p,dP); + const vfloat<M> cap0 = -r0dr; + const vbool<M> inside_cone = y > cap0; + return valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)) & inside_cone; + } + + /* + + This function tests whether a point lies inside the capped cone + tangential to its ending spheres. + + Therefore one has to check if the point is inside the + region defined by the cone clipping planes, which is + performed similar as in the previous function. + + To perform the inside cone test we need to project the + point onto the line p0->p1: + + dP = p1-p0 + Y = (p-p0)*dP/length(dP) + + This value Y is the distance to the projection point from + p0. To obtain a parameter value u going from 0 to 1 along + the line p0->p1 we calculate: + + U = Y/length(dP) + + The radii to use at points p0 and p1 are: + + w0 = sr * r0 + w1 = sr * r1 + dw = w1-w0 + + Using these radii and u one can directly test if the point + lies inside the cone using the formula dP*dP < wy*wy with: + + wy = w0 + u*dw + py = p0 + u*dP - p + + By multiplying the calculations with length(p1-p0) and + inserting the definition of w can obtain simpler equations: + + y = (p-p0)*dP + ry = r0 + y/dP^2 * dr + wy = sr*ry + py = p0 + y/dP^2*dP - p + y0 = - r0 * dr + y1 = dP^2 - r1 * dr + + Thus for the in-cone test we get: + + py^2 < wy^2 + <=> py^2 < sr^2 * ry^2 + <=> py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2 + + This can further get simplified to: + + (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y; + + */ + + __forceinline vbool<M> isInsideCappedCone (const vbool<M>& valid_i, const Vec3vf<M>& p) const + { + const Vec3vf<M> p0p = p - p0; + const vfloat<M> y = dot(p0p,dP); + const vfloat<M> cap0 = -r0dr+vfloat<M>(ulp); + const vfloat<M> cap1 = -r1*dr + dPdP; + + vbool<M> inside_cone = valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)); + inside_cone &= y > cap0; // start clipping plane + inside_cone &= y < cap1; // end clipping plane + inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test + return inside_cone; + } + + protected: + Vec3vf<M> p0; + Vec3vf<M> p1; + Vec3vf<M> dP; + vfloat<M> dPdP; + vfloat<M> r0; + vfloat<M> sqr_r0; + vfloat<M> r1; + vfloat<M> dr; + vfloat<M> drdr; + vfloat<M> r0dr; + vfloat<M> g; + }; + + template<int M> + struct ConeGeometryIntersector : public ConeGeometry<M> + { + using ConeGeometry<M>::p0; + using ConeGeometry<M>::p1; + using ConeGeometry<M>::dP; + using ConeGeometry<M>::dPdP; + using ConeGeometry<M>::r0; + using ConeGeometry<M>::sqr_r0; + using ConeGeometry<M>::r1; + using ConeGeometry<M>::dr; + using ConeGeometry<M>::r0dr; + using ConeGeometry<M>::g; + + ConeGeometryIntersector (const Vec3vf<M>& ray_org, const Vec3vf<M>& ray_dir, const vfloat<M>& dOdO, const vfloat<M>& rcp_dOdO, const Vec4vf<M>& a, const Vec4vf<M>& b) + : ConeGeometry<M>(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir), dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)), yp(OdP + r0dr) {} + + /* + + This function intersects a ray with a cone that touches a + start sphere p0/r0 and end sphere p1/r1. + + To find this ray/cone intersections one could just + calculate radii w0 and w1 as described above and use a + standard ray/cone intersection routine with these + radii. However, it turns out that calculations can get + simplified when deriving a specialized ray/cone + intersection for this special case. We perform + calculations relative to the cone origin p0 and define: + + O = ray_org - p0 + dO = ray_dir + dP = p1-p0 + dr = r1-r0 + dw = w1-w0 + + For some t we can compute the potential hit point h = O + t*dO and + project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In + case of an intersection, the squared distance from the hit point + projected onto the cone center line to the hit point should be equal + to the squared cone radius at u: + + (u*dP - h)^2 = (w0 + u*dw)^2 + + Inserting the definition of h, u, w0, and dw into this formula, then + factoring out all terms, and sorting by t^2, t^1, and t^0 terms + yields a quadratic equation to solve. + + Inserting u: + ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2 + + Multiplying by dP^4: + ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2 + + Inserting w0 and dw: + ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2) + ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2 + + Now one can insert the definition of h, factor out, and presort by t: + ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2 + ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2 + + Factoring out further and sorting by t^2, t^1 and t^0 yields: + + 0 = t^2 * [ ((dO*dP)*dP - dO-dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ] + + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ] + + t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ] + + This can be simplified to: + + 0 = t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ] + + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ] + + t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ] + + Solving this quadratic equation yields the values for t at which the + ray intersects the cone. + + */ + + __forceinline bool intersectCone(vbool<M>& valid, vfloat<M>& lower, vfloat<M>& upper) + { + /* return no hit by default */ + lower = pos_inf; + upper = neg_inf; + + /* compute quadratic equation A*t^2 + B*t + C = 0 */ + const vfloat<M> OO = dot(O,O); + const vfloat<M> OdO = dot(dO,O); + const vfloat<M> A = g * dOdO - sqr(dOdP); + const vfloat<M> B = 2.0f * (g*OdO - dOdP*yp); + const vfloat<M> C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP; + + /* we miss the cone if determinant is smaller than zero */ + const vfloat<M> D = B*B - 4.0f*A*C; + valid &= (D >= 0.0f & g > 0.0f); // if g <= 0 then the cone is inside a sphere end + + /* When rays are parallel to the cone surface, then the + * ray may be inside or outside the cone. We just assume a + * miss in that case, which is fine as rays inside the + * cone would anyway hit the ending spheres in that + * case. */ + valid &= abs(A) > min_rcp_input; + if (unlikely(none(valid))) { + return false; + } + + /* compute distance to front and back hit */ + const vfloat<M> Q = sqrt(D); + const vfloat<M> rcp_2A = rcp(2.0f*A); + t_cone_front = (-B-Q)*rcp_2A; + y_cone_front = yp + t_cone_front*dOdP; + lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat<M>(pos_inf)); +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + t_cone_back = (-B+Q)*rcp_2A; + y_cone_back = yp + t_cone_back *dOdP; + upper = select( (y_cone_back > -(float)ulp) & (y_cone_back <= g) & (g > 0.0f), t_cone_back , vfloat<M>(neg_inf)); +#endif + return true; + } + + /* + This function intersects the ray with the end sphere at + p1. We already clip away hits that are inside the + neighboring cone segment. + + */ + + __forceinline void intersectEndSphere(vbool<M>& valid, + const ConeGeometry<M>& coneR, + vfloat<M>& lower, vfloat<M>& upper) + { + /* calculate front and back hit with end sphere */ + const Vec3vf<M> O1 = org - p1; + const vfloat<M> O1dO = dot(O1,dO); + const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1)); + const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) ); + + /* clip away front hit if it is inside next cone segment */ + t_sph1_front = (-O1dO - rhs1)*rcp_dOdO; + const Vec3vf<M> hit_front = org + t_sph1_front*dO; + vbool<M> valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front); + lower = select(valid_sph1_front, t_sph1_front, vfloat<M>(pos_inf)); + +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + /* clip away back hit if it is inside next cone segment */ + t_sph1_back = (-O1dO + rhs1)*rcp_dOdO; + const Vec3vf<M> hit_back = org + t_sph1_back*dO; + vbool<M> valid_sph1_back = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back); + upper = select(valid_sph1_back, t_sph1_back, vfloat<M>(neg_inf)); +#else + upper = vfloat<M>(neg_inf); +#endif + } + + __forceinline void intersectBeginSphere(const vbool<M>& valid, + vfloat<M>& lower, vfloat<M>& upper) + { + /* calculate front and back hit with end sphere */ + const Vec3vf<M> O1 = org - p0; + const vfloat<M> O1dO = dot(O1,dO); + const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0)); + const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) ); + + /* clip away front hit if it is inside next cone segment */ + t_sph0_front = (-O1dO - rhs1)*rcp_dOdO; + vbool<M> valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0; + lower = select(valid_sph1_front, t_sph0_front, vfloat<M>(pos_inf)); + +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + /* clip away back hit if it is inside next cone segment */ + t_sph0_back = (-O1dO + rhs1)*rcp_dOdO; + vbool<M> valid_sph1_back = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0; + upper = select(valid_sph1_back, t_sph0_back, vfloat<M>(neg_inf)); +#else + upper = vfloat<M>(neg_inf); +#endif + } + + /* + + This function calculates the geometry normal of some cone hit. + + For a given hit point h (relative to p0) with a cone + starting at p0 with radius w0 and ending at p1 with + radius w1 one normally calculates the geometry normal by + first calculating the parmetric u hit location along the + cone: + + u = dot(h,dP)/dP^2 + + Using this value one can now directly calculate the + geometry normal by bending the connection vector (h-u*dP) + from hit to projected hit with some cone dependent value + dw/sqrt(dP^2) * normalize(dP): + + Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP) + + The length of the vector (h-u*dP) can also get calculated + by interpolating the radii as w0+u*dw which yields: + + Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP + + Multiplying with (w0+u*dw) yield a scaled Ng': + + Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP + + Inserting the definition of w0 and dw and refactoring + yield a furhter scaled Ng'': + + Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP + + Now inserting the definition of u gives and multiplying + with the denominator yields: + + Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP + + Factoring out, cancelling terms, dividing by dP^2, and + factoring again yields finally: + + Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr) + + */ + + __forceinline Vec3vf<M> Ng_cone(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back); + const vfloat<M> t = select(front_hit, t_cone_front, t_cone_back); + const Vec3vf<M> h = O + t*dO; + return g*h-dP*y; +#else + const Vec3vf<M> h = O + t_cone_front*dO; + return g*h-dP*y_cone_front; +#endif + } + + /* compute geometry normal of sphere hit as the difference + * vector from hit point to sphere center */ + + __forceinline Vec3vf<M> Ng_sphere1(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> t_sph1 = select(front_hit, t_sph1_front, t_sph1_back); + return org+t_sph1*dO-p1; +#else + return org+t_sph1_front*dO-p1; +#endif + } + + __forceinline Vec3vf<M> Ng_sphere0(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> t_sph0 = select(front_hit, t_sph0_front, t_sph0_back); + return org+t_sph0*dO-p0; +#else + return org+t_sph0_front*dO-p0; +#endif + } + + /* + This function calculates the u coordinate of a + hit. Therefore we use the hit distance y (which is zero + at the first cone clipping plane) and divide by distance + g between the clipping planes. + + */ + + __forceinline vfloat<M> u_cone(const vbool<M>& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back); + return clamp(y*rcp(g)); +#else + return clamp(y_cone_front*rcp(g)); +#endif + } + + private: + Vec3vf<M> org; + Vec3vf<M> O; + Vec3vf<M> dO; + vfloat<M> dOdO; + vfloat<M> rcp_dOdO; + vfloat<M> OdP; + vfloat<M> dOdP; + + /* for ray/cone intersection */ + private: + vfloat<M> yp; + vfloat<M> y_cone_front; + vfloat<M> t_cone_front; +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat<M> y_cone_back; + vfloat<M> t_cone_back; +#endif + + /* for ray/sphere intersection */ + private: + vfloat<M> t_sph1_front; + vfloat<M> t_sph0_front; +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat<M> t_sph1_back; + vfloat<M> t_sph0_back; +#endif + }; + + + template<int M, typename Epilog, typename ray_tfar_func> + static __forceinline bool intersectConeSphere(const vbool<M>& valid_i, + const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, + const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar, + const Vec4vf<M>& v0, const Vec4vf<M>& v1, + const Vec4vf<M>& vL, const Vec4vf<M>& vR, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + /* move ray origin closer to make calculations numerically stable */ + const vfloat<M> dOdO = sqr(ray_dir); + const vfloat<M> rcp_dOdO = rcp(dOdO); + const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz()); + const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; + const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir; + + /* intersect with cone from v0 to v1 */ + vfloat<M> t_cone_lower, t_cone_upper; + ConeGeometryIntersector<M> cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1); + vbool<M> validCone = valid; + cone.intersectCone(validCone, t_cone_lower, t_cone_upper); + + valid &= (validCone | (cone.g <= 0.0f)); // if cone is entirely in sphere end - check sphere + if (unlikely(none(valid))) + return false; + + /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */ + const ConeGeometry<M> coneL (v0, vL); + const ConeGeometry<M> coneR (v1, vR); +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const Vec3vf<M> hit_lower = ray_org + t_cone_lower*ray_dir; + const Vec3vf<M> hit_upper = ray_org + t_cone_upper*ray_dir; + t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat<M>(pos_inf)); + t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat<M>(neg_inf)); +#endif + + /* intersect ending sphere */ + vfloat<M> t_sph1_lower, t_sph1_upper; + vfloat<M> t_sph0_lower = vfloat<M>(pos_inf); + vfloat<M> t_sph0_upper = vfloat<M>(neg_inf); + cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper); + + const vbool<M> isBeginPoint = valid & (vL[0] == vfloat<M>(pos_inf)); + if (unlikely(any(isBeginPoint))) { + cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper); + } + + /* CSG union of cone and end sphere */ + vfloat<M> t_sph_lower = min(t_sph0_lower, t_sph1_lower); + vfloat<M> t_cone_sphere_lower = min(t_cone_lower, t_sph_lower); +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat<M> t_sph_upper = max(t_sph0_upper, t_sph1_upper); + vfloat<M> t_cone_sphere_upper = max(t_cone_upper, t_sph_upper); + + /* filter out hits that are not in tnear/tfar range */ + const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf); + const vbool<M> valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat<M>(neg_inf); + + /* check if there is a first hit */ + const vbool<M> valid_first = valid_lower | valid_upper; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat<M> t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper); + const vbool<M> cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper; + const vbool<M> sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper; + const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); + const vfloat<M> u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first); + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_cone_sphere_upper; + const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar()); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; + const vbool<M> sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper; + const Vec3vf<M> Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false))); + const vfloat<M> u_second = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat<M>(zero), vfloat<M>(one))); + + hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; +#else + /* filter out hits that are not in tnear/tfar range */ + const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf); + + /* check if there is a valid hit */ + if (unlikely(none(valid_lower))) + return false; + + /* construct first hit */ + const vbool<M> cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper; + const vbool<M> sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper; + const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); + const vfloat<M> u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first); + const bool is_hit_first = epilog(valid_lower, hit); + + return is_hit_first; +#endif + } + + } // end namespace __roundline_internal + + template<int M> + struct RoundLinearCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Ray> + struct ray_tfar { + Ray& ray; + __forceinline ray_tfar(Ray& ray) : ray(ray) {} + __forceinline vfloat<M> operator() () const { return ray.tfar; }; + }; + + template<typename Ray, typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Vec4vf<M>& vLi, const Vec4vf<M>& vRi, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat<M> ray_tnear(ray.tnear()); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + const Vec4vf<M> vL = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vLi); + const Vec4vf<M> vR = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vRi); + return __roundline_internal::intersectConeSphere<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar<Ray>(ray),v0,v1,vL,vR,epilog); + } + }; + + template<int M, int K> + struct RoundLinearCurveIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + struct ray_tfar { + RayK<K>& ray; + size_t k; + __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {} + __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }; + }; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, const Vec4vf<M>& v1i, + const Vec4vf<M>& vLi, const Vec4vf<M>& vRi, + const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> ray_tnear = ray.tnear()[k]; + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec4vf<M> v1 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v1i); + const Vec4vf<M> vL = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vLi); + const Vec4vf<M> vR = enlargeRadiusToMinWidth<M>(context,geom,ray_org,vRi); + return __roundline_internal::intersectConeSphere<M>(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h new file mode 100644 index 0000000000..29061d6475 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/roundlinei_intersector.h @@ -0,0 +1,123 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "roundline_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct RoundLinearCurveMiIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, bool filter> + struct RoundLinearCurveMiMBIntersector1 + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersector1<M>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,filter>(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line); + } + }; + + template<int M, int K, bool filter> + struct RoundLinearCurveMiIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + + template<int M, int K, bool filter> + struct RoundLinearCurveMiMBIntersectorK + { + typedef LineMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get<LineSegments>(line.geomID()); + Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); + const vbool<M> valid = line.valid(); + return RoundLinearCurveIntersectorK<M,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,K,filter>(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/sphere_intersector.h b/thirdparty/embree/kernels/geometry/sphere_intersector.h new file mode 100644 index 0000000000..2670f9762d --- /dev/null +++ b/thirdparty/embree/kernels/geometry/sphere_intersector.h @@ -0,0 +1,183 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_points.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template<int M> + struct SphereIntersectorHitM + { + __forceinline SphereIntersectorHitM() {} + + __forceinline SphereIntersectorHitM(const vfloat<M>& t, const Vec3vf<M>& Ng) + : vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv(const size_t i) const { + return Vec2f(0.0f, 0.0f); + } + __forceinline float t(const size_t i) const { + return vt[i]; + } + __forceinline Vec3fa Ng(const size_t i) const { + return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); + } + + public: + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct SphereIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template<typename Epilog> + static __forceinline bool intersect( + const vbool<M>& valid_i, Ray& ray, + const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const vfloat<M> rd2 = rcp(dot(ray.dir, ray.dir)); + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + const vfloat<M> td = sqrt((r2 - l2) * rd2); + const vfloat<M> t_front = projC0 - td; + const vfloat<M> t_back = projC0 + td; + + const vbool<M> valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar); + const vbool<M> valid_back = valid & (ray.tnear() <= t_back ) & (t_back <= ray.tfar); + + /* check if there is a first hit */ + const vbool<M> valid_first = valid_front | valid_back; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat<M> td_front = -td; + const vfloat<M> td_back = +td; + const vfloat<M> t_first = select(valid_front, t_front, t_back); + const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; + SphereIntersectorHitM<M> hit(t_first, Ng_first); + + /* invoke intersection filter for first hit */ + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_back; + const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const Vec3vf<M> Ng_second = td_back * ray_dir - perp; + hit = SphereIntersectorHitM<M> (t_second, Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + + template<typename Epilog> + static __forceinline bool intersect( + const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom, + const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog) + { + const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + return intersect(valid_i,ray,pre,v0,epilog); + } + }; + + template<int M, int K> + struct SphereIntersectorK + { + typedef CurvePrecalculationsK<K> Precalculations; + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid_i, + RayK<K>& ray, size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf<M>& v0i, + const Epilog& epilog) + { + vbool<M> valid = valid_i; + + const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf<M> v0 = enlargeRadiusToMinWidth<M>(context,geom,ray_org,v0i); + const Vec3vf<M> center = v0.xyz(); + const vfloat<M> radius = v0.w; + + const Vec3vf<M> c0 = center - ray_org; + const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; + const Vec3vf<M> perp = c0 - projC0 * ray_dir; + const vfloat<M> l2 = dot(perp, perp); + const vfloat<M> r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + const vfloat<M> td = sqrt((r2 - l2) * rd2); + const vfloat<M> t_front = projC0 - td; + const vfloat<M> t_back = projC0 + td; + + const vbool<M> valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]); + const vbool<M> valid_back = valid & (ray.tnear()[k] <= t_back ) & (t_back <= ray.tfar[k]); + + /* check if there is a first hit */ + const vbool<M> valid_first = valid_front | valid_back; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat<M> td_front = -td; + const vfloat<M> td_back = +td; + const vfloat<M> t_first = select(valid_front, t_front, t_back); + const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; + SphereIntersectorHitM<M> hit(t_first, Ng_first); + + /* invoke intersection filter for first hit */ + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat<M> t_second = t_back; + const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const Vec3vf<M> Ng_second = td_back * ray_dir - perp; + hit = SphereIntersectorHitM<M> (t_second, Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/spherei_intersector.h b/thirdparty/embree/kernels/geometry/spherei_intersector.h new file mode 100644 index 0000000000..7a0b428117 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/spherei_intersector.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intersector_epilog.h" +#include "pointi.h" +#include "sphere_intersector.h" + +namespace embree +{ + namespace isa + { + template<int M, bool filter> + struct SphereMiIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + return SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, + PointQueryContext* context, + const Primitive& sphere) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere); + } + }; + + template<int M, bool filter> + struct SphereMiMBIntersector1 + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()); + const vbool<M> valid = sphere.valid(); + SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()); + const vbool<M> valid = sphere.valid(); + return SphereIntersector1<M>::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM<M, filter>(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, + PointQueryContext* context, + const Primitive& sphere) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere); + } + }; + + template<int M, int K, bool filter> + struct SphereMiIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom); + const vbool<M> valid = sphere.valid(); + return SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + }; + + template<int M, int K, bool filter> + struct SphereMiMBIntersectorK + { + typedef PointMi<M> Primitive; + typedef CurvePrecalculationsK<K> Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = sphere.valid(); + SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get<Points>(sphere.geomID()); + Vec4vf<M> v0; sphere.gather(v0, geom, ray.time()[k]); + const vbool<M> valid = sphere.valid(); + return SphereIntersectorK<M, K>::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM<M, K, filter>(ray, k, context, sphere.geomID(), sphere.primID())); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1.h b/thirdparty/embree/kernels/geometry/subdivpatch1.h new file mode 100644 index 0000000000..ae0d4e2616 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subdivpatch1.h @@ -0,0 +1,38 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../geometry/primitive.h" +#include "../subdiv/subdivpatch1base.h" + +namespace embree +{ + + struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + + static Type type; + + public: + + /*! constructor for cached subdiv patch */ + SubdivPatch1 (const unsigned int gID, + const unsigned int pID, + const unsigned int subPatch, + const SubdivMesh *const mesh, + const size_t time, + const Vec2f uv[4], + const float edge_level[4], + const int subdiv[4], + const int simd_width) + : SubdivPatch1Base(gID,pID,subPatch,mesh,time,uv,edge_level,subdiv,simd_width) {} + }; +} diff --git a/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h new file mode 100644 index 0000000000..b4b15a1210 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subdivpatch1_intersector.h @@ -0,0 +1,237 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subdivpatch1.h" +#include "grid_soa.h" +#include "grid_soa_intersector1.h" +#include "grid_soa_intersector_packet.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template<typename T> + class SubdivPatch1Precalculations : public T + { + public: + __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) + : T(ray,ptr) {} + }; + + template<int K, typename T> + class SubdivPatch1PrecalculationsK : public T + { + public: + __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) + : T(valid,ray) {} + }; + + class SubdivPatch1Intersector1 + { + public: + typedef GridSOA Primitive; + typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + lazy_node = prim->root(0); + pre.grid = (Primitive*)prim; + return false; + } + + /*! Intersect a ray with the primitive. */ + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + intersect(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + /*! Test if the ray is occluded by the primitive */ + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + // TODO: PointQuery implement + assert(false && "not implemented"); + return false; + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) { + return pointQuery(This,query,context,prim,ty,tquery,lazy_node); + } + }; + + class SubdivPatch1MBIntersector1 + { + public: + typedef SubdivPatch1 Primitive; + typedef GridSOAMBIntersector1::Precalculations Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) + { + Primitive* prim = (Primitive*) prim_i; + GridSOA* grid = nullptr; + grid = (GridSOA*) prim->root_ref.get(); + pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime); + lazy_node = grid->root(pre.itime); + pre.grid = grid; + return false; + } + + /*! Intersect a ray with the primitive. */ + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node); + else processLazyNode(pre,ray,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + intersect(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + /*! Test if the ray is occluded by the primitive */ + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,ray,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) { + return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + template<int N> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + // TODO: PointQuery implement + assert(false && "not implemented"); + return false; + } + + template<int N, bool robust> + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) { + return pointQuery(This,query,context,prim,ty,tquery,lazy_node); + } + }; + + template <int K> + struct SubdivPatch1IntersectorK + { + typedef GridSOA Primitive; + typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + lazy_node = prim->root(0); + pre.grid = (Primitive*)prim; + return false; + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + }; + + typedef SubdivPatch1IntersectorK<4> SubdivPatch1Intersector4; + typedef SubdivPatch1IntersectorK<8> SubdivPatch1Intersector8; + typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16; + + template <int K> + struct SubdivPatch1MBIntersectorK + { + typedef SubdivPatch1 Primitive; + //typedef GridSOAMBIntersectorK<K>::Precalculations Precalculations; + typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) + { + Primitive* prim = (Primitive*) prim_i; + GridSOA* grid = (GridSOA*) prim->root_ref.get(); + lazy_node = grid->troot; + pre.grid = grid; + return false; + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template<int N, bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,robust> &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + }; + + typedef SubdivPatch1MBIntersectorK<4> SubdivPatch1MBIntersector4; + typedef SubdivPatch1MBIntersectorK<8> SubdivPatch1MBIntersector8; + typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16; + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid.h b/thirdparty/embree/kernels/geometry/subgrid.h new file mode 100644 index 0000000000..ce54421cab --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid.h @@ -0,0 +1,517 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_grid_mesh.h" +#include "../bvh/bvh.h" + +namespace embree +{ + /* Stores M quads from an indexed face set */ + struct SubGrid + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline SubGrid() { } + + /* Construction from vertices and IDs */ + __forceinline SubGrid(const unsigned int x, + const unsigned int y, + const unsigned int geomID, + const unsigned int primID) + : _x(x), _y(y), _geomID(geomID), _primID(primID) + { + } + + __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); } + __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); } + + /* Gather the quads */ + __forceinline void gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const GridMesh* const mesh, + const GridMesh::Grid &g) const + { + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = vfloat4::loadu(mesh->vertexPtr(vtxID00)); + const vfloat4 vtx01 = vfloat4::loadu(mesh->vertexPtr(vtxID01)); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = vfloat4::loadu(mesh->vertexPtr(vtxID10)); + const vfloat4 vtx11 = vfloat4::loadu(mesh->vertexPtr(vtxID11)); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = vfloat4::loadu(mesh->vertexPtr(vtxID02)); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = vfloat4::loadu(mesh->vertexPtr(vtxID12)); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = vfloat4::loadu(mesh->vertexPtr(vtxID20)); + const vfloat4 vtx21 = vfloat4::loadu(mesh->vertexPtr(vtxID21)); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = vfloat4::loadu(mesh->vertexPtr(vtxID22)); + + transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); + transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); + transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); + transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); + } + + template<typename T> + __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const + { + const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0)); + const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1)); + return lerp(v0,v1,ftime); + } + + /* Gather the quads */ + __forceinline void gatherMB(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const GridMesh* const mesh, + const GridMesh::Grid &g, + const size_t itime, + const float ftime) const + { + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime); + const vfloat4 vtx01 = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime); + const vfloat4 vtx11 = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime); + const vfloat4 vtx21 = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime); + + transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); + transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); + transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); + transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); + } + + + + /* Gather the quads */ + __forceinline void gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene) const + { + const GridMesh* const mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + gather(p0,p1,p2,p3,mesh,g); + } + + /* Gather the quads in the motion blur case */ + __forceinline void gatherMB(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene, + const size_t itime, + const float ftime) const + { + const GridMesh* const mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime); + } + + /* Gather the quads */ + __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const + { + const GridMesh* mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const Vec3fa vtx00 = Vec3fa::loadu(mesh->vertexPtr(vtxID00)); + const Vec3fa vtx01 = Vec3fa::loadu(mesh->vertexPtr(vtxID01)); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const Vec3fa vtx10 = Vec3fa::loadu(mesh->vertexPtr(vtxID10)); + const Vec3fa vtx11 = Vec3fa::loadu(mesh->vertexPtr(vtxID11)); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const Vec3fa vtx02 = Vec3fa::loadu(mesh->vertexPtr(vtxID02)); + const size_t vtxID12 = vtxID11 + deltaX; + const Vec3fa vtx12 = Vec3fa::loadu(mesh->vertexPtr(vtxID12)); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const Vec3fa vtx20 = Vec3fa::loadu(mesh->vertexPtr(vtxID20)); + const Vec3fa vtx21 = Vec3fa::loadu(mesh->vertexPtr(vtxID21)); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const Vec3fa vtx22 = Vec3fa::loadu(mesh->vertexPtr(vtxID22)); + + vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; + vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; + vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; + vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; + } + + /* Gather the quads */ + __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const + { + const GridMesh* mesh = scene->get<GridMesh>(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = getVertexMB<vfloat4>(mesh,vtxID00,itime,ftime); + const vfloat4 vtx01 = getVertexMB<vfloat4>(mesh,vtxID01,itime,ftime); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = getVertexMB<vfloat4>(mesh,vtxID10,itime,ftime); + const vfloat4 vtx11 = getVertexMB<vfloat4>(mesh,vtxID11,itime,ftime); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = getVertexMB<vfloat4>(mesh,vtxID02,itime,ftime); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = getVertexMB<vfloat4>(mesh,vtxID12,itime,ftime); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = getVertexMB<vfloat4>(mesh,vtxID20,itime,ftime); + const vfloat4 vtx21 = getVertexMB<vfloat4>(mesh,vtxID21,itime,ftime); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = getVertexMB<vfloat4>(mesh,vtxID22,itime,ftime); + + vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; + vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; + vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; + vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; + } + + + /* Calculate the bounds of the subgrid */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + FATAL("not implemented yet"); + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) + { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + FATAL("not implemented yet"); + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + FATAL("not implemented yet"); + return allBounds; + } + + + friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) { + return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )"; + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID() const { return _primID; } + __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; } + __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; } + + private: + unsigned short _x; + unsigned short _y; + unsigned int _geomID; // geometry ID of mesh + unsigned int _primID; // primitive ID of primitive inside mesh + }; + + struct SubGridID { + unsigned short x; + unsigned short y; + unsigned int primID; + + __forceinline SubGridID() {} + __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) : + x(x), y(y), primID(primID) {} + }; + + /* QuantizedBaseNode as large subgrid leaf */ + template<int N> + struct SubGridQBVHN + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + __forceinline size_t size() const + { + for (size_t i=0;i<N;i++) + if (primID(i) == -1) return i; + return N; + } + + __forceinline void clear() { + for (size_t i=0;i<N;i++) + subgridIDs[i] = SubGridID(0,0,(unsigned int)-1); + qnode.clear(); + } + + /* Default constructor */ + __forceinline SubGridQBVHN() { } + + /* Construction from vertices and IDs */ + __forceinline SubGridQBVHN(const unsigned int x[N], + const unsigned int y[N], + const unsigned int primID[N], + const BBox3fa * const subGridBounds, + const unsigned int geomID, + const unsigned int items) + { + clear(); + _geomID = geomID; + + __aligned(64) typename BVHN<N>::AABBNode node; + node.clear(); + for (size_t i=0;i<items;i++) + { + subgridIDs[i] = SubGridID(x[i],y[i],primID[i]); + node.setBounds(i,subGridBounds[i]); + } + qnode.init_dim(node); + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; } + __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; } + __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; } + + __forceinline SubGrid subgrid(const size_t i) const { + assert(i < N); + assert(primID(i) != -1); + return SubGrid(x(i),y(i),geomID(),primID(i)); + } + + public: + SubGridID subgridIDs[N]; + + typename BVHN<N>::QuantizedBaseNode qnode; + + unsigned int _geomID; // geometry ID of mesh + + + friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) { + cout << "SubGridQBVHN " << embree_endl; + for (size_t i=0;i<N;i++) + cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl; + cout << "geomID " << sg._geomID << embree_endl; + cout << "lowerX " << sg.qnode.dequantizeLowerX() << embree_endl; + cout << "upperX " << sg.qnode.dequantizeUpperX() << embree_endl; + cout << "lowerY " << sg.qnode.dequantizeLowerY() << embree_endl; + cout << "upperY " << sg.qnode.dequantizeUpperY() << embree_endl; + cout << "lowerZ " << sg.qnode.dequantizeLowerZ() << embree_endl; + cout << "upperZ " << sg.qnode.dequantizeUpperZ() << embree_endl; + return cout; + } + + }; + + template<int N> + typename SubGridQBVHN<N>::Type SubGridQBVHN<N>::type; + + typedef SubGridQBVHN<4> SubGridQBVH4; + typedef SubGridQBVHN<8> SubGridQBVH8; + + + /* QuantizedBaseNode as large subgrid leaf */ + template<int N> + struct SubGridMBQBVHN + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + __forceinline size_t size() const + { + for (size_t i=0;i<N;i++) + if (primID(i) == -1) return i; + return N; + } + + __forceinline void clear() { + for (size_t i=0;i<N;i++) + subgridIDs[i] = SubGridID(0,0,(unsigned int)-1); + qnode.clear(); + } + + /* Default constructor */ + __forceinline SubGridMBQBVHN() { } + + /* Construction from vertices and IDs */ + __forceinline SubGridMBQBVHN(const unsigned int x[N], + const unsigned int y[N], + const unsigned int primID[N], + const BBox3fa * const subGridBounds0, + const BBox3fa * const subGridBounds1, + const unsigned int geomID, + const float toffset, + const float tscale, + const unsigned int items) + { + clear(); + _geomID = geomID; + time_offset = toffset; + time_scale = tscale; + + __aligned(64) typename BVHN<N>::AABBNode node0,node1; + node0.clear(); + node1.clear(); + for (size_t i=0;i<items;i++) + { + subgridIDs[i] = SubGridID(x[i],y[i],primID[i]); + node0.setBounds(i,subGridBounds0[i]); + node1.setBounds(i,subGridBounds1[i]); + } + qnode.node0.init_dim(node0); + qnode.node1.init_dim(node1); + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; } + __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; } + __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; } + + __forceinline SubGrid subgrid(const size_t i) const { + assert(i < N); + assert(primID(i) != -1); + return SubGrid(x(i),y(i),geomID(),primID(i)); + } + + __forceinline float adjustTime(const float t) const { return time_scale * (t-time_offset); } + + template<int K> + __forceinline vfloat<K> adjustTime(const vfloat<K> &t) const { return time_scale * (t-time_offset); } + + public: + SubGridID subgridIDs[N]; + + typename BVHN<N>::QuantizedBaseNodeMB qnode; + + float time_offset; + float time_scale; + unsigned int _geomID; // geometry ID of mesh + + + friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) { + cout << "SubGridMBQBVHN " << embree_endl; + for (size_t i=0;i<N;i++) + cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl; + cout << "geomID " << sg._geomID << embree_endl; + cout << "time_offset " << sg.time_offset << embree_endl; + cout << "time_scale " << sg.time_scale << embree_endl; + cout << "lowerX " << sg.qnode.node0.dequantizeLowerX() << embree_endl; + cout << "upperX " << sg.qnode.node0.dequantizeUpperX() << embree_endl; + cout << "lowerY " << sg.qnode.node0.dequantizeLowerY() << embree_endl; + cout << "upperY " << sg.qnode.node0.dequantizeUpperY() << embree_endl; + cout << "lowerZ " << sg.qnode.node0.dequantizeLowerZ() << embree_endl; + cout << "upperZ " << sg.qnode.node0.dequantizeUpperZ() << embree_endl; + cout << "lowerX " << sg.qnode.node1.dequantizeLowerX() << embree_endl; + cout << "upperX " << sg.qnode.node1.dequantizeUpperX() << embree_endl; + cout << "lowerY " << sg.qnode.node1.dequantizeLowerY() << embree_endl; + cout << "upperY " << sg.qnode.node1.dequantizeUpperY() << embree_endl; + cout << "lowerZ " << sg.qnode.node1.dequantizeLowerZ() << embree_endl; + cout << "upperZ " << sg.qnode.node1.dequantizeUpperZ() << embree_endl; + return cout; + } + + }; + +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h new file mode 100644 index 0000000000..ad5fee2e4e --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h @@ -0,0 +1,517 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "subgrid_intersector_moeller.h" +#include "subgrid_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + + // ======================================================================================= + // =================================== SubGridIntersectors =============================== + // ======================================================================================= + + + template<int N, bool filter> + struct SubGridIntersector1Moeller + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); + assert(accel); + context->geomID = subgrid.geomID(); + context->primID = subgrid.primID(); + return accel->pointQuery(query, context); + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } else { + mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + changed |= pointQuery(query, context, prim[i].subgrid(ID)); + } + } + return changed; + } + }; + + template<int N, bool filter> + struct SubGridIntersector1Pluecker + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); + context->geomID = subgrid.geomID(); + context->primID = subgrid.primID(); + return accel->pointQuery(query, context); + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } else { + mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist); + } +#if defined(__AVX__) + STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + changed |= pointQuery(query, context, prim[i].subgrid(ID)); + } + } + return changed; + } + }; + + template<int N, int K, bool filter> + struct SubGridIntersectorKMoeller + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool<K> valid0 = valid_i; + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + vbool<K> valid0 = valid; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + + + template<int N, int K, bool filter> + struct SubGridIntersectorKPluecker + { + typedef SubGridQBVHN<N> Primitive; + typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool<K> valid0 = valid_i; + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf<K> p0 = vtx[i*4+0]; + const Vec3vf<K> p1 = vtx[i*4+1]; + const Vec3vf<K> p2 = vtx[i*4+2]; + const Vec3vf<K> p3 = vtx[i*4+3]; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + + break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + vbool<K> valid0 = valid; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h new file mode 100644 index 0000000000..64937d34fe --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_moeller.h @@ -0,0 +1,382 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "quad_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + + /* ----------------------------- */ + /* -- single ray intersectors -- */ + /* ----------------------------- */ + + template<int M> + __forceinline void interpolateUV(MoellerTrumboreHitM<M,UVIdentity<M>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) + { + /* correct U,V interpolation across the entire grid */ + const vint<M> sx((int)subgrid.x()); + const vint<M> sy((int)subgrid.y()); + const vint<M> sxM(sx + stepX); + const vint<M> syM(sy + stepY); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + (vfloat<M>)sxM * hit.absDen) * inv_resX; + hit.V = (hit.V + (vfloat<M>)syM * hit.absDen) * inv_resY; + } + + template<int M, bool filter> + struct SubGridQuadMIntersector1MoellerTrumbore; + + template<int M, bool filter> + struct SubGridQuadMIntersector1MoellerTrumbore + { + __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} + + __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + MoellerTrumboreIntersector1<M> intersector(ray,nullptr); + Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct SubGridQuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} + + __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + UVIdentity<8> mapUV; + MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV); + MoellerTrumboreIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit))) + { + /* correct U,V interpolation across the entire grid */ + const vfloat8 U = select(flags,hit.absDen - hit.V,hit.U); + const vfloat8 V = select(flags,hit.absDen - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + // ============================================================================================================================ + // ============================================================================================================================ + // ============================================================================================================================ + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + template<int K> + __forceinline void interpolateUV(const vbool<K>& valid, MoellerTrumboreHitK<K,UVIdentity<K>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const unsigned int i) + { + /* correct U,V interpolation across the entire grid */ + const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); + const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); + const float inv_resX = rcp((float)(int)(g.resX-1)); + const float inv_resY = rcp((float)(int)(g.resY-1)); + hit.U = select(valid,(hit.U + vfloat<K>((float)sx) * hit.absDen) * inv_resX,hit.U); + hit.V = select(valid,(hit.V + vfloat<K>((float)sy) * hit.absDen) * inv_resY,hit.V); + } + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKMoellerTrumboreBase + { + __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {} + + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + MoellerTrumboreIntersectorK<M,K> intersector; + + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + } + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + } + return any(valid0|valid1); + } + + template<typename Epilog> + __forceinline bool occludedK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + MoellerTrumboreIntersectorK<M,K> intersector; + + vbool<K> valid_final = valid; + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + valid_final &= !valid0; + } + if (none(valid_final)) return true; + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + valid_final &= !valid1; + } + return none(valid_final); + } + + static __forceinline bool intersect1(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + MoellerTrumboreHitM<M,UVIdentity<M>> &hit) + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + MoellerTrumboreIntersectorK<8,K> intersector; + UVIdentity<M> mapUV; + return intersector.intersectEdge(ray,k,v0,e1,e2,mapUV,hit); + } + + }; + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter> + { + __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + MoellerTrumboreIntersectorK<M,K> intersector; + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + MoellerTrumboreIntersectorK<M,K> intersector; + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + return false; + } + }; + + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + + UVIdentity<8> mapUV; + MoellerTrumboreHitM<8,UVIdentity<8>> hit(mapUV); + if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit)) + { + const vfloat8 U = select(flags,hit.absDen - hit.V,hit.U); + const vfloat8 V = select(flags,hit.absDen - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + + } + return false; + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + + + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h new file mode 100644 index 0000000000..5ded56e1f7 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector_pluecker.h @@ -0,0 +1,367 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + + template<int M> + __forceinline void interpolateUV(PlueckerHitM<M,UVIdentity<M>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) + { + /* correct U,V interpolation across the entire grid */ + const vint<M> sx((int)subgrid.x()); + const vint<M> sy((int)subgrid.y()); + const vint<M> sxM(sx + stepX); + const vint<M> syM(sy + stepY); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + vfloat<M>(sxM) * hit.UVW) * inv_resX; + hit.V = (hit.V + vfloat<M>(syM) * hit.UVW) * inv_resY; + } + + template<int M, bool filter> + struct SubGridQuadMIntersector1Pluecker; + + template<int M, bool filter> + struct SubGridQuadMIntersector1Pluecker + { + __forceinline SubGridQuadMIntersector1Pluecker() {} + + __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + PlueckerIntersector1<M> intersector(ray,nullptr); + + Intersect1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + PlueckerIntersector1<M> intersector(ray,nullptr); + Occluded1EpilogMU<M,filter> epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<bool filter> + struct SubGridQuadMIntersector1Pluecker<4,filter> + { + __forceinline SubGridQuadMIntersector1Pluecker() {} + + __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + + UVIdentity<8> mapUV; + PlueckerHitM<8,UVIdentity<8>> hit(mapUV); + PlueckerIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,mapUV,hit))) + { + /* correct U,V interpolation across the entire grid */ + const vfloat8 U = select(flags,hit.UVW - hit.V,hit.U); + const vfloat8 V = select(flags,hit.UVW - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + template<int K> + __forceinline void interpolateUV(const vbool<K>& valid, PlueckerHitK<K,UVIdentity<K>> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const unsigned int i) + { + /* correct U,V interpolation across the entire grid */ + const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); + const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); + const float inv_resX = rcp((float)(int)(g.resX-1)); + const float inv_resY = rcp((float)(int)(g.resY-1)); + hit.U = select(valid,(hit.U + vfloat<K>((float)sx) * hit.UVW) * inv_resX,hit.U); + hit.V = select(valid,(hit.V + vfloat<K>((float)sy) * hit.UVW) * inv_resY,hit.V); + } + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKPlueckerBase + { + __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} + + template<typename Epilog> + __forceinline bool intersectK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + PlueckerHitK<K,UVIdentity<K>> hit(mapUV); + PlueckerIntersectorK<M,K> intersector; + + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + } + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + } + return any(valid0|valid1); + } + + template<typename Epilog> + __forceinline bool occludedK(const vbool<K>& valid, + RayK<K>& ray, + const Vec3vf<K>& v0, + const Vec3vf<K>& v1, + const Vec3vf<K>& v2, + const Vec3vf<K>& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + PlueckerHitK<K,UVIdentity<K>> hit(mapUV); + PlueckerIntersectorK<M,K> intersector; + + vbool<K> valid_final = valid; + const vbool<K> valid0 = intersector.intersectK(valid,ray,v0,v1,v3,mapUV,hit); + if (any(valid0)) + { + interpolateUV(valid0,hit,g,subgrid,i); + epilog(valid0,hit); + valid_final &= !valid0; + } + if (none(valid_final)) return true; + const vbool<K> valid1 = intersector.intersectK(valid,ray,v2,v3,v1,mapUV,hit); + if (any(valid1)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV(valid1,hit,g,subgrid,i); + epilog(valid1,hit); + valid_final &= !valid1; + } + return none(valid_final); + } + + + }; + + + + + template<int M, int K, bool filter> + struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase<M,K,filter> + { + __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {} + + __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + PlueckerIntersectorK<M,K> intersector; + + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + PlueckerIntersectorK<M,K> intersector; + + /* intersect first triangle */ + if (intersector.intersect(ray,k,v0,v1,v3,mapUV,hit)) + { + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,k,v2,v3,v1,mapUV,hit)) + { + hit.U = hit.UVW - hit.U; + hit.V = hit.UVW - hit.V; + interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + return false; + } + }; + + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template<int K, bool filter> + struct SubGridQuadMIntersectorKPluecker<4,K,filter> : public SubGridQuadMIntersectorKPlueckerBase<4,K,filter> + { + __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray) + : SubGridQuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} + + template<typename Epilog> + __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + UVIdentity<8> mapUV; + PlueckerHitM<8,UVIdentity<8>> hit(mapUV); + PlueckerIntersectorK<8,K> intersector; + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,k,vtx0,vtx1,vtx2,mapUV,hit))) + { + /* correct U,V interpolation across the entire grid */ + const vfloat8 U = select(flags,hit.UVW - hit.V,hit.U); + const vfloat8 V = select(flags,hit.UVW - hit.U,hit.V); + hit.U = U; + hit.V = V; + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + }; +#endif + + + } +} diff --git a/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h new file mode 100644 index 0000000000..473d656e24 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/subgrid_mb_intersector.h @@ -0,0 +1,236 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid_intersector.h" + +namespace embree +{ + namespace isa + { + template<int N, bool filter> + struct SubGridMBIntersector1Pluecker + { + typedef SubGridMBQBVHN<N> Primitive; + typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + float ftime; + const int itime = mesh->timeSegment(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + float ftime; + const int itime = mesh->timeSegment(ray.time(), ftime); + + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, subgrid); + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + const float time = prim[i].adjustTime(ray.time()); + + assert(time <= 1.0f); + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + for (size_t i=0;i<num;i++) + { + const float time = prim[i].adjustTime(ray.time()); + assert(time <= 1.0f); + vfloat<N> dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node) + { + assert(false && "not implemented"); + return false; + } + }; + + + template<int N, int K, bool filter> + struct SubGridMBIntersectorKPluecker + { + typedef SubGridMBQBVHN<N> Primitive; + typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + size_t m_valid = movemask(valid_i); + while(m_valid) + { + size_t ID = bscf(m_valid); + intersect(pre,ray,ID,context,subgrid); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool<K> valid0 = valid_i; + size_t m_valid = movemask(valid_i); + while(m_valid) + { + size_t ID = bscf(m_valid); + if (occluded(pre,ray,ID,context,subgrid)) + clear(valid0,ID); + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get<GridMesh>(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template<bool robust> + static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + const vfloat<K> time = prim[j].template adjustTime<K>(ray.time()); + + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template<bool robust> + static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK; + + vbool<K> valid0 = valid; + for (size_t j=0;j<num;j++) + { + size_t m_valid = movemask(prim[j].qnode.validMask()); + const vfloat<K> time = prim[j].template adjustTime<K>(ray.time()); + vfloat<K> dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template<bool robust> + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + const float time = prim[i].adjustTime(ray.time()[k]); + assert(time <= 1.0f); + + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template<bool robust> + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,robust> &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1<N,robust> isec1; + + for (size_t i=0;i<num;i++) + { + vfloat<N> dist; + const float time = prim[i].adjustTime(ray.time()[k]); + assert(time <= 1.0f); + + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle.h b/thirdparty/embree/kernels/geometry/triangle.h new file mode 100644 index 0000000000..24b758ae48 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle.h @@ -0,0 +1,162 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Precalculated representation for M triangles. Stores for each + triangle a base vertex, two edges, and the geometry normal to + speed up intersection calculations */ + template<int M> + struct TriangleM + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleM() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleM(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns true if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangle */ + __forceinline BBox3fa bounds() const + { + Vec3vf<M> p0 = v0; + Vec3vf<M> p1 = v0-e1; + Vec3vf<M> p2 = v0+e2; + Vec3vf<M> lower = min(p0,p1,p2); + Vec3vf<M> upper = max(p0,p1,p2); + vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(TriangleM* dst, const TriangleM& src) + { + vfloat<M>::store_nt(&dst->v0.x,src.v0.x); + vfloat<M>::store_nt(&dst->v0.y,src.v0.y); + vfloat<M>::store_nt(&dst->v0.z,src.v0.z); + vfloat<M>::store_nt(&dst->e1.x,src.e1.x); + vfloat<M>::store_nt(&dst->e1.y,src.e1.y); + vfloat<M>::store_nt(&dst->e1.z,src.e1.z); + vfloat<M>::store_nt(&dst->e2.x,src.e2.x); + vfloat<M>::store_nt(&dst->e2.y,src.e2.y); + vfloat<M>::store_nt(&dst->e2.z,src.e2.z); + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M; i++) + { + if (unlikely(geomID(i) == -1)) break; + const unsigned geomId = geomID(i); + const unsigned primId = primID(i); + const TriangleMesh::Triangle& tri = mesh->triangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); + return bounds; + } + + public: + Vec3vf<M> v0; // base vertex of the triangles + Vec3vf<M> e1; // 1st edge of the triangles (v0-v1) + Vec3vf<M> e2; // 2nd edge of the triangles (v2-v0) + private: + vuint<M> geomIDs; // geometry IDs + vuint<M> primIDs; // primitive IDs + }; + + template<int M> + typename TriangleM<M>::Type TriangleM<M>::type; + + typedef TriangleM<4> Triangle4; +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_intersector.h new file mode 100644 index 0000000000..2cdff78ec8 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector.h @@ -0,0 +1,96 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "triangle_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMIntersector1Moeller + { + typedef TriangleM<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + + }; + + /*! Intersects M triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMIntersectorKMoeller + { + typedef TriangleM<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri) + { + STAT_USER(0,TriangleM<M>::max_size()); + for (size_t i=0; i<TriangleM<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i); + const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i); + pre.intersectEdgeK(valid_i,ray,p0,e1,e2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<TriangleM<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> p0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> e1 = broadcast<vfloat<K>>(tri.e1,i); + const Vec3vf<K> e2 = broadcast<vfloat<K>>(tri.e2,i); + pre.intersectEdgeK(valid0,ray,p0,e1,e2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h new file mode 100644 index 0000000000..0a42d8f08b --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector_moeller.h @@ -0,0 +1,525 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +/*! This intersector implements a modified version of the Moeller + * Trumbore intersector from the paper "Fast, Minimum Storage + * Ray-Triangle Intersection". In contrast to the paper we + * precalculate some factors and factor the calculations differently + * to allow precalculating the cross product e1 x e2. The resulting + * algorithm is similar to the fastest one of the paper "Optimizing + * Ray-Triangle Intersection via Automated Search". */ + +namespace embree +{ + namespace isa + { + template<int M, typename UVMapper> + struct MoellerTrumboreHitM + { + __forceinline MoellerTrumboreHitM(const UVMapper& mapUV) : mapUV(mapUV) {} + + __forceinline MoellerTrumboreHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const vfloat<M>& absDen, const Vec3vf<M>& Ng, const UVMapper& mapUV) + : U(U), V(V), T(T), absDen(absDen), mapUV(mapUV), valid(valid), vNg(Ng) {} + + __forceinline void finalize() + { + const vfloat<M> rcpAbsDen = rcp(absDen); + vt = T * rcpAbsDen; + vu = U * rcpAbsDen; + vv = V * rcpAbsDen; + mapUV(vu,vv,vNg); + } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> T; + vfloat<M> absDen; + UVMapper mapUV; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M, bool early_out = true> + struct MoellerTrumboreIntersector1 + { + __forceinline MoellerTrumboreIntersector1() {} + + __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {} + + template<typename UVMapper> + __forceinline bool intersect(const vbool<M>& valid0, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Vec3vf<M>& tri_Ng, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + /* calculate denominator */ + vbool<M> valid = valid0; + const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org); + const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir); + const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O; + const Vec3vf<M> R = cross(C,D); + const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D); + + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen; + const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + valid &= (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(early_out && none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar)); + if (likely(early_out && none(valid))) return false; + + /* update hit information */ + new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV); + + return true; + } + + template<typename UVMapper> + __forceinline bool intersectEdge(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1); + return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + vbool<M> valid = true; + const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1); + return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,v0,e1,e2,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(valid,ray,v0,e1,e2,mapUV,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& e1, + const Vec3vf<M>& e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,v0,e1,e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + auto mapUV = UVIdentity<M>(); + MoellerTrumboreHitM<M,UVIdentity<M>> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(valid,ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + }; + + template<int K, typename UVMapper> + struct MoellerTrumboreHitK + { + __forceinline MoellerTrumboreHitK(const UVMapper& mapUV) : mapUV(mapUV) {} + __forceinline MoellerTrumboreHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng, const UVMapper& mapUV) + : U(U), V(V), T(T), absDen(absDen), Ng(Ng), mapUV(mapUV) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vfloat<K> rcpAbsDen = rcp(absDen); + const vfloat<K> t = T * rcpAbsDen; + vfloat<K> u = U * rcpAbsDen; + vfloat<K> v = V * rcpAbsDen; + Vec3vf<K> vNg = Ng; + mapUV(u,v,vNg); + return std::make_tuple(u,v,t,vNg); + } + + vfloat<K> U; + vfloat<K> V; + const vfloat<K> T; + const vfloat<K> absDen; + const Vec3vf<K> Ng; + const UVMapper& mapUV; + }; + + template<int M, int K> + struct MoellerTrumboreIntersectorK + { + __forceinline MoellerTrumboreIntersectorK() {} + __forceinline MoellerTrumboreIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + //RayK<K>& ray, + const Vec3vf<K>& ray_org, + const Vec3vf<K>& ray_dir, + const vfloat<K>& ray_tnear, + const vfloat<K>& ray_tfar, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Vec3vf<K>& tri_Ng, + const UVMapper& mapUV, + MoellerTrumboreHitK<K,UVMapper> &hit) const + { + /* calculate denominator */ + vbool<K> valid = valid0; + const Vec3vf<K> C = tri_v0 - ray_org; + const Vec3vf<K> R = cross(C,ray_dir); + const vfloat<K> den = dot(tri_Ng,ray_dir); + const vfloat<K> absDen = abs(den); + const vfloat<K> sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat<K> U = dot(tri_e2,R) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat<K> V = dot(tri_e1,R) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat<K> W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + new (&hit) MoellerTrumboreHitK<K,UVMapper>(U,V,T,absDen,tri_Ng,mapUV); + return valid; + } + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + MoellerTrumboreHitK<K,UVMapper> &hit) const + { + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit); + } + + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper, typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit); + return epilog(valid,hit); + } + + + + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,mapUV,hit); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper, typename Epilog> + __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitK<K,UVIdentity<K>> hit(mapUV); + const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1); + const vbool<K> valid = intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,mapUV,hit); + return epilog(valid,hit); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + template<typename UVMapper> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + /* calculate denominator */ + typedef Vec3vf<M> Vec3vfM; + const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1); + + const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vfM C = Vec3vfM(tri_v0) - O; + const Vec3vfM R = cross(C,D); + const vfloat<M> den = dot(Vec3vfM(tri_Ng),D); + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen; + const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) MoellerTrumboreHitM<M,UVMapper>(valid,U,V,T,absDen,tri_Ng,mapUV); + return true; + } + + template<typename UVMapper> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,mapUV,hit))) + { + hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); + hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper; + return any(hit.valid); + } + return false; + } + + template<typename UVMapper> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + MoellerTrumboreHitM<M,UVMapper>& hit) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,mapUV,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + MoellerTrumboreHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,mapUV,epilog); + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + return intersect(ray,k,v0,v1,v2,UVIdentity<M>(),epilog); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,time_range,v0,e1,e2,mapUV,epilog); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h new file mode 100644 index 0000000000..8fbefcea88 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector_pluecker.h @@ -0,0 +1,407 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "trianglev.h" +#include "trianglev_mb.h" +#include "intersector_epilog.h" + +/*! Modified Pluecker ray/triangle intersector. The test first shifts + * the ray origin into the origin of the coordinate system and then + * uses Pluecker coordinates for the intersection. Due to the shift, + * the Pluecker coordinate calculation simplifies and the tests get + * numerically stable. The edge equations are watertight along the + * edge for neighboring triangles. */ + +namespace embree +{ + namespace isa + { + template<int M, typename UVMapper> + struct PlueckerHitM + { + __forceinline PlueckerHitM(const UVMapper& mapUV) : mapUV(mapUV) {} + + __forceinline PlueckerHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW, const vfloat<M>& t, const Vec3vf<M>& Ng, const UVMapper& mapUV) + : U(U), V(V), UVW(UVW), mapUV(mapUV), valid(valid), vt(t), vNg(Ng) {} + + __forceinline void finalize() + { + const vbool<M> invalid = abs(UVW) < min_rcp_input; + const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); + vu = min(U * rcpUVW,1.0f); + vv = min(V * rcpUVW,1.0f); + mapUV(vu,vv,vNg); + } + + __forceinline Vec2vf<M> uv() const { return Vec2vf<M>(vu,vv); } + __forceinline vfloat<M> t () const { return vt; } + __forceinline Vec3vf<M> Ng() const { return vNg; } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat<M> U; + vfloat<M> V; + vfloat<M> UVW; + const UVMapper& mapUV; + + public: + vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M, bool early_out = true> + struct PlueckerIntersector1 + { + __forceinline PlueckerIntersector1() {} + + __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {} + + template<typename UVMapper> + __forceinline bool intersect(const vbool<M>& valid0, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + vbool<M> valid = valid0; + + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org); + const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(early_out && none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); + valid &= den != vfloat<M>(zero); + if (unlikely(early_out && none(valid))) return false; + + /* update hit information */ + new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV); + return true; + } + + template<typename UVMapper> + __forceinline bool intersectEdge(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + return intersect(valid,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + vbool<M> valid = true; + return intersect(valid,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + } + + template<typename UVMapper> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper>& hit) const + { + return intersectEdge(ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& e1, + const Vec3vf<M>& e2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (likely(intersectEdge(ray,v0,e1,e2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + auto mapUV = UVIdentity<M>(); + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + if (likely(intersect(ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (likely(intersect(valid,ray,v0,v1,v2,mapUV,hit))) return epilog(hit.valid,hit); + return false; + } + + }; + + template<int K, typename UVMapper> + struct PlueckerHitK + { + __forceinline PlueckerHitK(const UVMapper& mapUV) : mapUV(mapUV) {} + + __forceinline PlueckerHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW, const vfloat<K>& t, const Vec3vf<K>& Ng, const UVMapper& mapUV) + : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vbool<K> invalid = abs(UVW) < min_rcp_input; + const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); + vfloat<K> u = min(U * rcpUVW,1.0f); + vfloat<K> v = min(V * rcpUVW,1.0f); + Vec3vf<K> vNg = Ng; + mapUV(u,v,vNg); + return std::make_tuple(u,v,t,vNg); + } + vfloat<K> U; + vfloat<K> V; + const vfloat<K> UVW; + const vfloat<K> t; + const Vec3vf<K> Ng; + const UVMapper& mapUV; + }; + + template<int M, int K> + struct PlueckerIntersectorK + { + __forceinline PlueckerIntersectorK() {} + __forceinline PlueckerIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename UVMapper> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + PlueckerHitK<K,UVMapper> &hit) const + { + /* calculate vertices relative to ray origin */ + vbool<K> valid = valid0; + const Vec3vf<K> O = ray.org; + const Vec3vf<K> D = ray.dir; + const Vec3vf<K> v0 = tri_v0-O; + const Vec3vf<K> v1 = tri_v1-O; + const Vec3vf<K> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<K> e0 = v2-v0; + const Vec3vf<K> e1 = v0-v1; + const Vec3vf<K> e2 = v1-v2; + + /* perform edge tests */ + const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D); + const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D); + const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D); + const vfloat<K> UVW = U+V+W; + const vfloat<K> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return valid; + + /* calculate geometry normal and denominator */ + const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D)); + + /* perform depth test */ + const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng))); + const vfloat<K> t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return valid; + + /* calculate hit information */ + new (&hit) PlueckerHitK<K,UVMapper>(U,V,UVW,t,Ng,mapUV); + return valid; + } + + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const Epilog& epilog) const + { + UVIdentity<K> mapUV; + PlueckerHitK<K,UVIdentity<K>> hit(mapUV); + const vbool<K> valid = intersectK(valid0,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + return epilog(valid,hit); + } + + template<typename UVMapper, typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitK<K,UVMapper> hit(mapUV); + const vbool<K> valid = intersectK(valid0,ray,tri_v0,tri_v1,tri_v2,mapUV,hit); + return epilog(valid,hit); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + template<typename UVMapper> + __forceinline bool intersect(RayK<K>& ray, size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + PlueckerHitM<M,UVMapper> &hit) const + { + /* calculate vertices relative to ray origin */ + const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vf<M> v0 = tri_v0-O; + const Vec3vf<M> v1 = tri_v1-O; + const Vec3vf<M> v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf<M> e0 = v2-v0; + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v1-v2; + + + /* perform edge tests */ + const vfloat<M> U = dot(cross(e0,v2+v0),D); + const vfloat<M> V = dot(cross(e1,v0+v1),D); + const vfloat<M> W = dot(cross(e2,v1+v2),D); + + const vfloat<M> UVW = U+V+W; + const vfloat<M> eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = max(U,V,W) <= eps; +#else + vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2); + const vfloat<M> den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat<M> T = twice(dot(v0,Ng)); + const vfloat<M> t = rcp(den)*T; + valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat<M>(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + new (&hit) PlueckerHitM<M,UVMapper>(valid,U,V,UVW,t,Ng,mapUV); + return true; + } + + template<typename UVMapper, typename Epilog> + __forceinline bool intersect(RayK<K>& ray, size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + PlueckerHitM<M,UVMapper> hit(mapUV); + if (intersect(ray,k,tri_v0,tri_v1,tri_v2,mapUV,hit)) + return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + const Epilog& epilog) const + { + UVIdentity<M> mapUV; + PlueckerHitM<M,UVIdentity<M>> hit(mapUV); + if (intersect(ray,k,tri_v0,tri_v1,tri_v2,mapUV,hit)) + return epilog(hit.valid,hit); + return false; + } + + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h new file mode 100644 index 0000000000..f05dcc4537 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_intersector_woop.h @@ -0,0 +1,418 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +/*! This intersector implements a modified version of the Woop's ray-triangle intersection test */ + +namespace embree +{ + namespace isa + { + template<int M> + struct WoopHitM + { + __forceinline WoopHitM() {} + + __forceinline WoopHitM(const vbool<M>& valid, + const vfloat<M>& U, + const vfloat<M>& V, + const vfloat<M>& T, + const vfloat<M>& inv_det, + const Vec3vf<M>& Ng) + : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {} + + __forceinline void finalize() + { + vt = T; + vu = U*inv_det; + vv = V*inv_det; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + const vfloat<M> U; + const vfloat<M> V; + const vfloat<M> T; + const vfloat<M> inv_det; + + public: + const vbool<M> valid; + vfloat<M> vu; + vfloat<M> vv; + vfloat<M> vt; + Vec3vf<M> vNg; + }; + + template<int M> + struct WoopPrecalculations1 + { + unsigned int kx,ky,kz; + Vec3vf<M> org; + Vec3fa S; + __forceinline WoopPrecalculations1() {} + + __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr) + { + kz = maxDim(abs(ray.dir)); + kx = (kz+1) % 3; + ky = (kx+1) % 3; + const float inv_dir_kz = rcp(ray.dir[kz]); + if (ray.dir[kz]) std::swap(kx,ky); + S.x = ray.dir[kx] * inv_dir_kz; + S.y = ray.dir[ky] * inv_dir_kz; + S.z = inv_dir_kz; + org = Vec3vf<M>(ray.org[kx],ray.org[ky],ray.org[kz]); + } + }; + + + template<int M> + struct WoopIntersector1 + { + + typedef WoopPrecalculations1<M> Precalculations; + + __forceinline WoopIntersector1() {} + + __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {} + + static __forceinline bool intersect(const vbool<M>& valid0, + Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_v1, + const Vec3vf<M>& tri_v2, + WoopHitM<M>& hit) + { + vbool<M> valid = valid0; + + /* vertices relative to ray origin */ + const Vec3vf<M> org = Vec3vf<M>(pre.org.x,pre.org.y,pre.org.z); + const Vec3vf<M> A = Vec3vf<M>(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org; + const Vec3vf<M> B = Vec3vf<M>(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org; + const Vec3vf<M> C = Vec3vf<M>(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org; + + /* shear and scale vertices */ + const vfloat<M> Ax = nmadd(A.z,pre.S.x,A.x); + const vfloat<M> Ay = nmadd(A.z,pre.S.y,A.y); + const vfloat<M> Bx = nmadd(B.z,pre.S.x,B.x); + const vfloat<M> By = nmadd(B.z,pre.S.y,B.y); + const vfloat<M> Cx = nmadd(C.z,pre.S.x,C.x); + const vfloat<M> Cy = nmadd(C.z,pre.S.y,C.y); + + /* scaled barycentric */ + const vfloat<M> U0 = Cx*By; + const vfloat<M> U1 = Cy*Bx; + const vfloat<M> V0 = Ax*Cy; + const vfloat<M> V1 = Ay*Cx; + const vfloat<M> W0 = Bx*Ay; + const vfloat<M> W1 = By*Ax; +#if !defined(__AVX512F__) + valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) | + (U0 <= U1) & (V0 <= V1) & (W0 <= W1); +#else + valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1); +#endif + + if (likely(none(valid))) return false; + const vfloat<M> U = U0-U1; + const vfloat<M> V = V0-V1; + const vfloat<M> W = W0-W1; + + const vfloat<M> det = U+V+W; + + valid &= det != 0.0f; + const vfloat<M> inv_det = rcp(det); + + const vfloat<M> Az = pre.S.z * A.z; + const vfloat<M> Bz = pre.S.z * B.z; + const vfloat<M> Cz = pre.S.z * C.z; + const vfloat<M> T = madd(U,Az,madd(V,Bz,W*Cz)); + const vfloat<M> t = T * inv_det; + /* perform depth test */ + valid &= (vfloat<M>(ray.tnear()) < t) & (t <= vfloat<M>(ray.tfar)); + if (likely(none(valid))) return false; + + const Vec3vf<M> tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1); + + /* update hit information */ + new (&hit) WoopHitM<M>(valid,U,V,t,inv_det,tri_Ng); + return true; + } + + static __forceinline bool intersect(Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + WoopHitM<M>& hit) + { + vbool<M> valid = true; + return intersect(valid,ray,pre,v0,v1,v2,hit); + } + + + template<typename Epilog> + static __forceinline bool intersect(Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) + { + WoopHitM<M> hit; + if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + static __forceinline bool intersect(const vbool<M>& valid, + Ray& ray, + const Precalculations& pre, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) + { + WoopHitM<M> hit; + if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + }; + +#if 0 + template<int K> + struct WoopHitK + { + __forceinline WoopHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng) + : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {} + + __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const + { + const vfloat<K> rcpAbsDen = rcp(absDen); + const vfloat<K> t = T * rcpAbsDen; + const vfloat<K> u = U * rcpAbsDen; + const vfloat<K> v = V * rcpAbsDen; + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat<K> U; + const vfloat<K> V; + const vfloat<K> T; + const vfloat<K> absDen; + const Vec3vf<K> Ng; + }; + + template<int M, int K> + struct WoopIntersectorK + { + __forceinline WoopIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + //RayK<K>& ray, + const Vec3vf<K>& ray_org, + const Vec3vf<K>& ray_dir, + const vfloat<K>& ray_tnear, + const vfloat<K>& ray_tfar, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Vec3vf<K>& tri_Ng, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool<K> valid = valid0; + const Vec3vf<K> C = tri_v0 - ray_org; + const Vec3vf<K> R = cross(C,ray_dir); + const vfloat<K> den = dot(tri_Ng,ray_dir); + const vfloat<K> absDen = abs(den); + const vfloat<K> sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat<K> U = dot(tri_e2,R) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat<K> V = dot(tri_e1,R) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat<K> W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat<K>(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + WoopHitK<K> hit(U,V,T,absDen,tri_Ng); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_v1, + const Vec3vf<K>& tri_v2, + const Epilog& epilog) const + { + const Vec3vf<K> e1 = tri_v0-tri_v1; + const Vec3vf<K> e2 = tri_v2-tri_v0; + const Vec3vf<K> Ng = cross(e2,e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog); + } + + /*! Intersects K rays with one of M triangles. */ + template<typename Epilog> + __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, + RayK<K>& ray, + const Vec3vf<K>& tri_v0, + const Vec3vf<K>& tri_e1, + const Vec3vf<K>& tri_e2, + const Epilog& epilog) const + { + const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + WoopHitM<M>& hit) const + { + /* calculate denominator */ + typedef Vec3vf<M> Vec3vfM; + const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1); + + const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k); + const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k); + const Vec3vfM C = Vec3vfM(tri_v0) - O; + const Vec3vfM R = cross(C,D); + const vfloat<M> den = dot(Vec3vfM(tri_Ng),D); + const vfloat<M> absDen = abs(den); + const vfloat<M> sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen; + const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) WoopHitM<M>(valid,U,V,T,absDen,tri_Ng); + return true; + } + + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + WoopHitM<M>& hit) const + { + if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) + { + hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); + hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper; + return any(hit.valid); + } + return false; + } + + template<typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Epilog& epilog) const + { + WoopHitM<M> hit; + if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersectEdge(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& tri_v0, + const Vec3vf<M>& tri_e1, + const Vec3vf<M>& tri_e2, + const Epilog& epilog) const + { + WoopHitM<M> hit; + if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,epilog); + } + + template<typename Epilog> + __forceinline bool intersect(RayK<K>& ray, + size_t k, + const BBox<vfloat<M>>& time_range, + const Vec3vf<M>& v0, + const Vec3vf<M>& v1, + const Vec3vf<M>& v2, + const Epilog& epilog) const + { + const Vec3vf<M> e1 = v0-v1; + const Vec3vf<M> e2 = v2-v0; + return intersectEdge(ray,k,time_range,v0,e1,e2,epilog); + } + }; +#endif + } +} diff --git a/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h new file mode 100644 index 0000000000..50106bcc16 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/triangle_triangle_intersector.h @@ -0,0 +1,132 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primitive.h" + +namespace embree +{ + namespace isa + { + struct TriangleTriangleIntersector + { + __forceinline static float T(float pa0, float pa1, float da0, float da1) { + return pa0 + (pa1-pa0)*da0/(da0-da1); + } + + __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) { + return det(p-a0,a0-a1) >= 0.0f; + } + + __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) + { + const bool pab = point_line_side(p,a,b); + const bool pbc = point_line_side(p,b,c); + const bool pca = point_line_side(p,c,a); + return pab == pbc && pab == pca; + } + + __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1) + { + const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1); + const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1); + return different_sides0 && different_sides1; + } + + __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, + const Vec2f& b0, const Vec2f& b1, const Vec2f& b2) + { + const bool a01_b01 = intersect_line_line(a0,a1,b0,b1); + if (a01_b01) return true; + const bool a01_b12 = intersect_line_line(a0,a1,b1,b2); + if (a01_b12) return true; + const bool a01_b20 = intersect_line_line(a0,a1,b2,b0); + if (a01_b20) return true; + const bool a12_b01 = intersect_line_line(a1,a2,b0,b1); + if (a12_b01) return true; + const bool a12_b12 = intersect_line_line(a1,a2,b1,b2); + if (a12_b12) return true; + const bool a12_b20 = intersect_line_line(a1,a2,b2,b0); + if (a12_b20) return true; + const bool a20_b01 = intersect_line_line(a2,a0,b0,b1); + if (a20_b01) return true; + const bool a20_b12 = intersect_line_line(a2,a0,b1,b2); + if (a20_b12) return true; + const bool a20_b20 = intersect_line_line(a2,a0,b2,b0); + if (a20_b20) return true; + + bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2); + if (a_in_b) return true; + + bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2); + if (b_in_a) return true; + + return false; + } + + static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2, + const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2) + { + const float eps = 1E-5f; + + /* calculate triangle planes */ + const Vec3fa Na = cross(a1-a0,a2-a0); + const float Ca = dot(Na,a0); + const Vec3fa Nb = cross(b1-b0,b2-b0); + const float Cb = dot(Nb,b0); + + /* project triangle A onto plane B */ + const float da0 = dot(Nb,a0)-Cb; + const float da1 = dot(Nb,a1)-Cb; + const float da2 = dot(Nb,a2)-Cb; + if (max(da0,da1,da2) < -eps) return false; + if (min(da0,da1,da2) > +eps) return false; + //CSTAT(bvh_collide_prim_intersections4++); + + /* project triangle B onto plane A */ + const float db0 = dot(Na,b0)-Ca; + const float db1 = dot(Na,b1)-Ca; + const float db2 = dot(Na,b2)-Ca; + if (max(db0,db1,db2) < -eps) return false; + if (min(db0,db1,db2) > +eps) return false; + //CSTAT(bvh_collide_prim_intersections5++); + + if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) || + (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps))) + { + const size_t dz = maxDim(Na); + const size_t dx = (dz+1)%3; + const size_t dy = (dx+1)%3; + const Vec2f A0(a0[dx],a0[dy]); + const Vec2f A1(a1[dx],a1[dy]); + const Vec2f A2(a2[dx],a2[dy]); + const Vec2f B0(b0[dx],b0[dy]); + const Vec2f B1(b1[dx],b1[dy]); + const Vec2f B2(b2[dx],b2[dy]); + return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2); + } + + const Vec3fa D = cross(Na,Nb); + const float pa0 = dot(D,a0); + const float pa1 = dot(D,a1); + const float pa2 = dot(D,a2); + const float pb0 = dot(D,b0); + const float pb1 = dot(D,b1); + const float pb2 = dot(D,b2); + + BBox1f ba = empty; + if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1)); + if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2)); + if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0)); + + BBox1f bb = empty; + if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1)); + if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2)); + if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0)); + + return conjoint(ba,bb); + } + }; + } +} + + diff --git a/thirdparty/embree/kernels/geometry/trianglei.h b/thirdparty/embree/kernels/geometry/trianglei.h new file mode 100644 index 0000000000..6aad48a5ef --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglei.h @@ -0,0 +1,442 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene.h" + +namespace embree +{ + /* Stores M triangles from an indexed face set */ + template <int M> + struct TriangleMi + { + /* Virtual interface to query information about the triangle type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMi() { } + + /* Construction from vertices and IDs */ + __forceinline TriangleMi(const vuint<M>& v0, + const vuint<M>& v1, + const vuint<M>& v2, + const vuint<M>& geomIDs, + const vuint<M>& primIDs) +#if defined(EMBREE_COMPACT_POLYS) + : geomIDs(geomIDs), primIDs(primIDs) {} +#else + : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {} +#endif + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); } + + /* Returns if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M> geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M> primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangles */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + for (size_t i=0; i<M && valid(i); i++) { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i)); + bounds.extend(mesh->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && valid(i); i++) + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Non-temporal store */ + __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src) + { +#if !defined(EMBREE_COMPACT_POLYS) + vuint<M>::store_nt(&dst->v0_,src.v0_); + vuint<M>::store_nt(&dst->v1_,src.v1_); + vuint<M>::store_nt(&dst->v2_,src.v2_); +#endif + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + template<typename PrimRefT> + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> v0 = zero, v1 = zero, v2 = zero; + vuint<M> geomID = -1, primID = -1; + const PrimRefT* prim = &prims[begin]; + + for (size_t i=0; i<M; i++) + { + if (begin<end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); +#if !defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get<TriangleMesh>(prim->geomID()); + const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID()); + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; +#endif + begin++; + } else { + assert(i); + if (likely(i > 0)) { + geomID[i] = geomID[0]; + primID[i] = -1; + v0[i] = v0[0]; + v1[i] = v0[0]; + v2[i] = v0[0]; + } + } + if (begin<end) prim = &prims[begin]; + } + new (this) TriangleMi(v0,v1,v2,geomID,primID); // FIXME: use non temporal store + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + for (size_t i=0; i<M; i++) + { + if (primID(i) == -1) break; + const unsigned int primId = primID(i); + const TriangleMesh::Triangle& tri = mesh->triangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + } + return bounds; + } + + protected: +#if !defined(EMBREE_COMPACT_POLYS) + vuint<M> v0_; // 4 byte offset of 1st vertex + vuint<M> v1_; // 4 byte offset of 2nd vertex + vuint<M> v2_; // 4 byte offset of 3rd vertex +#endif + vuint<M> geomIDs; // geometry ID of mesh + vuint<M> primIDs; // primitive ID of primitive inside mesh + }; + + namespace isa + { + + template<int M> + struct TriangleMi : public embree::TriangleMi<M> + { +#if !defined(EMBREE_COMPACT_POLYS) + using embree::TriangleMi<M>::v0_; + using embree::TriangleMi<M>::v1_; + using embree::TriangleMi<M>::v2_; +#endif + using embree::TriangleMi<M>::geomIDs; + using embree::TriangleMi<M>::primIDs; + using embree::TriangleMi<M>::geomID; + using embree::TriangleMi<M>::primID; + using embree::TriangleMi<M>::valid; + + /* loads a single vertex */ + template<int vid> + __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + return (Vec3f) mesh->vertices[0][tri.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices = scene->vertices[geomID(index)]; + return (Vec3f&) vertices[v[index]]; +#endif + } + + template<int vid, typename T> + __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + const Vec3<T> p0(v0.x,v0.y,v0.z); + const Vec3<T> p1(v1.x,v1.y,v1.z); + return lerp(p0,p1,ftime); + } + + template<int vid, int K, typename T> + __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const + { + Vec3<T> p0, p1; + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + + for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]]; +#else + const vuint<M>& v = getVertexOffset<vid>(); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; + p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; + } + return (T(one)-ftime)*p0 + ftime*p1; + } + + struct Triangle { + vfloat4 v0,v1,v2; + }; + +#if defined(EMBREE_COMPACT_POLYS) + + __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero }; + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]]; + return { v0, v1, v2 }; + } + + __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const + { + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero }; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]]; + return { v0, v1, v2 }; + } + +#else + + __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const + { + const float* vertices = scene->vertices[geomID(i)]; + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + return { v0, v1, v2 }; + } + + __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const + { + const float* vertices = (const float*) mesh->vertexPtr(0,itime); + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + return { v0, v1, v2 }; + } + +#endif + + /* Gather the triangles */ + __forceinline void gather(Vec3vf<M>& p0, Vec3vf<M>& p1, Vec3vf<M>& p2, const Scene* const scene) const; + + template<int K> +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 + __noinline +#else + __forceinline +#endif + void gather(const vbool<K>& valid, + Vec3vf<K>& p0, + Vec3vf<K>& p1, + Vec3vf<K>& p2, + const size_t index, + const Scene* const scene, + const vfloat<K>& time) const + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); + + vfloat<K> ftime; + const vint<K> itime = mesh->timeSegment<K>(time, ftime); + + const size_t first = bsf(movemask(valid)); + if (likely(all(valid,itime[first] == itime))) + { + p0 = getVertex<0>(index, scene, itime[first], ftime); + p1 = getVertex<1>(index, scene, itime[first], ftime); + p2 = getVertex<2>(index, scene, itime[first], ftime); + } else { + p0 = getVertex<0,K>(valid, index, scene, itime, ftime); + p1 = getVertex<1,K>(valid, index, scene, itime, ftime); + p2 = getVertex<2,K>(valid, index, scene, itime, ftime); + } + } + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + const TriangleMesh* mesh, + const Scene *const scene, + const int itime) const; + + __forceinline void gather(Vec3vf<M>& p0, + Vec3vf<M>& p1, + Vec3vf<M>& p2, + const Scene *const scene, + const float time) const; + + +#if !defined(EMBREE_COMPACT_POLYS) + template<int N> const vuint<M>& getVertexOffset() const; +#endif + }; + +#if !defined(EMBREE_COMPACT_POLYS) + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; } + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; } + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; } +#endif + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const Scene* const scene) const + { + const Triangle tri0 = loadTriangle(0,scene); + const Triangle tri1 = loadTriangle(1,scene); + const Triangle tri2 = loadTriangle(2,scene); + const Triangle tri3 = loadTriangle(3,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + } + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const TriangleMesh* mesh, + const Scene *const scene, + const int itime) const + { + const Triangle tri0 = loadTriangle(0,itime,mesh); + const Triangle tri1 = loadTriangle(1,itime,mesh); + const Triangle tri2 = loadTriangle(2,itime,mesh); + const Triangle tri3 = loadTriangle(3,itime,mesh); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + } + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const Scene *const scene, + const float time) const + { + const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(0)); // in mblur mode all geometries are identical + + float ftime; + const int itime = mesh->timeSegment(time, ftime); + + Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime); + Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + p2 = lerp(a2,b2,vfloat4(ftime)); + } + } + + template<int M> + typename TriangleMi<M>::Type TriangleMi<M>::type; + + typedef TriangleMi<4> Triangle4i; +} diff --git a/thirdparty/embree/kernels/geometry/trianglei_intersector.h b/thirdparty/embree/kernels/geometry/trianglei_intersector.h new file mode 100644 index 0000000000..f7deb9e72d --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglei_intersector.h @@ -0,0 +1,336 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "trianglei.h" +#include "triangle_intersector_moeller.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiIntersector1Moeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMiIntersectorKMoeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + const Scene* scene = context->scene; + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + const Scene* scene = context->scene; + + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiIntersector1Pluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMiIntersectorKPluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + const Scene* scene = context->scene; + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + const Scene* scene = context->scene; + + for (size_t i=0; i<Primitive::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),RayHitK<K>::size()); + const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene); + const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene); + const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiMBIntersector1Moeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMiMBIntersectorKMoeller + { + typedef TriangleMi<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMiMBIntersector1Pluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMiMBIntersectorKPluecker + { + typedef TriangleMi<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri) + { + vbool<K> valid0 = valid_i; + for (size_t i=0; i<TriangleMi<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf<K> v0,v1,v2; tri.template gather<K>(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf<M> v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/trianglev.h b/thirdparty/embree/kernels/geometry/trianglev.h new file mode 100644 index 0000000000..cd94756b9e --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev.h @@ -0,0 +1,157 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M triangles in struct of array layout */ + template <int M> + struct TriangleMv + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMv() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns true if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangles */ + __forceinline BBox3fa bounds() const + { + Vec3vf<M> lower = min(v0,v1,v2); + Vec3vf<M> upper = max(v0,v1,v2); + vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src) + { + vfloat<M>::store_nt(&dst->v0.x,src.v0.x); + vfloat<M>::store_nt(&dst->v0.y,src.v0.y); + vfloat<M>::store_nt(&dst->v0.z,src.v0.z); + vfloat<M>::store_nt(&dst->v1.x,src.v1.x); + vfloat<M>::store_nt(&dst->v1.y,src.v1.y); + vfloat<M>::store_nt(&dst->v1.z,src.v1.z); + vfloat<M>::store_nt(&dst->v2.x,src.v2.x); + vfloat<M>::store_nt(&dst->v2.y,src.v2.y); + vfloat<M>::store_nt(&dst->v2.z,src.v2.z); + vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); + vuint<M>::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; i<M; i++) + { + if (primID(i) == -1) break; + const unsigned geomId = geomID(i); + const unsigned primId = primID(i); + const TriangleMesh::Triangle& tri = mesh->triangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID); + return bounds; + } + + public: + Vec3vf<M> v0; // 1st vertex of the triangles + Vec3vf<M> v1; // 2nd vertex of the triangles + Vec3vf<M> v2; // 3rd vertex of the triangles + private: + vuint<M> geomIDs; // geometry ID + vuint<M> primIDs; // primitive ID + }; + + template<int M> + typename TriangleMv<M>::Type TriangleMv<M>::type; + + typedef TriangleMv<4> Triangle4v; +} diff --git a/thirdparty/embree/kernels/geometry/trianglev_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_intersector.h new file mode 100644 index 0000000000..3abb7f8e32 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev_intersector.h @@ -0,0 +1,206 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "triangle_intersector_pluecker.h" +#include "triangle_intersector_moeller.h" +#include "triangle_intersector_woop.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvIntersector1Moeller + { + typedef TriangleMv<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + + template<int M, bool filter> + struct TriangleMvIntersector1Woop + { + typedef TriangleMv<M> Primitive; + typedef WoopIntersector1<M> intersec; + typedef WoopPrecalculations1<M> Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + + /*! Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMvIntersectorKMoeller + { + typedef TriangleMv<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<M>(),*/Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M + } + }; + + /*! Intersects M triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvIntersector1Pluecker + { + typedef TriangleMv<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M triangles with K rays */ + template<int M, int K, bool filter> + struct TriangleMvIntersectorKPluecker + { + typedef TriangleMv<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri) + { + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<M; i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); + const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i); + const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb.h b/thirdparty/embree/kernels/geometry/trianglev_mb.h new file mode 100644 index 0000000000..b550a29fd5 --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev_mb.h @@ -0,0 +1,201 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M triangles in struct of array layout */ + template<int M> + struct TriangleMvMB + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + + static Type type; + + public: + + /* primitive supports single time segments */ + static const bool singleTimeSegment = true; + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMvMB() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleMvMB(const Vec3vf<M>& a0, const Vec3vf<M>& a1, + const Vec3vf<M>& b0, const Vec3vf<M>& b1, + const Vec3vf<M>& c0, const Vec3vf<M>& c1, + const vuint<M>& geomIDs, const vuint<M>& primIDs) + : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); } + + /* Returns if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; } + + /* Returns the number of stored triangles */ + __forceinline size_t size() const { return bsf(~movemask(valid())); } + + /* Returns the geometry IDs */ + __forceinline vuint<M>& geomID() { return geomIDs; } + __forceinline const vuint<M>& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; } + + /* Returns the primitive IDs */ + __forceinline vuint<M>& primID() { return primIDs; } + __forceinline const vuint<M>& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; } + + /* Calculate the bounds of the triangles at t0 */ + __forceinline BBox3fa bounds0() const + { + Vec3vf<M> lower = min(v0,v1,v2); + Vec3vf<M> upper = max(v0,v1,v2); + const vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Calculate the bounds of the triangles at t1 */ + __forceinline BBox3fa bounds1() const + { + const Vec3vf<M> p0 = v0+dv0; + const Vec3vf<M> p1 = v1+dv1; + const Vec3vf<M> p2 = v2+dv2; + Vec3vf<M> lower = min(p0,p1,p2); + Vec3vf<M> upper = max(p0,p1,p2); + const vbool<M> mask = valid(); + lower.x = select(mask,lower.x,vfloat<M>(pos_inf)); + lower.y = select(mask,lower.y,vfloat<M>(pos_inf)); + lower.z = select(mask,lower.z,vfloat<M>(pos_inf)); + upper.x = select(mask,upper.x,vfloat<M>(neg_inf)); + upper.y = select(mask,upper.y,vfloat<M>(neg_inf)); + upper.z = select(mask,upper.z,vfloat<M>(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds() const { + return LBBox3fa(bounds0(),bounds1()); + } + + /* Fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero; + Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero; + + BBox3fa bounds0 = empty; + BBox3fa bounds1 = empty; + + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRef& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0); + const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1); + const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0); + const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1); + const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0); + const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1); + vgeomID [i] = geomID; + vprimID [i] = primID; + va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z; + va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z; + vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z; + vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z; + vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z; + vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z; + } + new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); + return LBBox3fa(bounds0,bounds1); + } + + /* Fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + vuint<M> vgeomID = -1, vprimID = -1; + Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero; + Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero; + + LBBox3fa allBounds = empty; + for (size_t i=0; i<M && begin<end; i++, begin++) + { + const PrimRefMB& prim = prims[begin]; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const TriangleMesh* const mesh = scene->get<TriangleMesh>(geomID); + const range<int> itime_range = mesh->timeSegmentRange(time_range); + assert(itime_range.size() == 1); + const int ilower = itime_range.begin(); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + allBounds.extend(mesh->linearBounds(primID, time_range)); + const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0); + const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1); + const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0); + const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1); + const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0); + const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1); + const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1)); + auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v); + auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v); + auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v); + vgeomID [i] = geomID; + vprimID [i] = primID; + va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z; + va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z; + vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z; + vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z; + vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z; + vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z; + } + new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); + return allBounds; + } + + public: + Vec3vf<M> v0; // 1st vertex of the triangles + Vec3vf<M> v1; // 2nd vertex of the triangles + Vec3vf<M> v2; // 3rd vertex of the triangles + Vec3vf<M> dv0; // difference vector between time steps t0 and t1 for first vertex + Vec3vf<M> dv1; // difference vector between time steps t0 and t1 for second vertex + Vec3vf<M> dv2; // difference vector between time steps t0 and t1 for third vertex + private: + vuint<M> geomIDs; // geometry ID + vuint<M> primIDs; // primitive ID + }; + + template<int M> + typename TriangleMvMB<M>::Type TriangleMvMB<M>::type; + + typedef TriangleMvMB<4> Triangle4vMB; +} diff --git a/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h new file mode 100644 index 0000000000..38cd52e85d --- /dev/null +++ b/thirdparty/embree/kernels/geometry/trianglev_mb_intersector.h @@ -0,0 +1,211 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvMBIntersector1Moeller + { + typedef TriangleMvMB<M> Primitive; + typedef MoellerTrumboreIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMvMBIntersectorKMoeller + { + typedef TriangleMvMB<M> Primitive; + typedef MoellerTrumboreIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template<int M, bool filter> + struct TriangleMvMBIntersector1Pluecker + { + typedef TriangleMvMB<M> Primitive; + typedef PlueckerIntersector1<M> Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,v0,v1,v2,UVIdentity<M>(),Intersect1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,v0,v1,v2,UVIdentity<M>(),Occluded1EpilogM<M,filter>(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template<int M, int K, bool filter> + struct TriangleMvMBIntersectorKPluecker + { + typedef TriangleMvMB<M> Primitive; + typedef PlueckerIntersectorK<M,K> Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri) + { + vbool<K> valid0 = valid_i; + + for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf<K> time(ray.time()); + const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i)); + const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i)); + const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i)); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + pre.intersect(ray,k,v0,v1,v2,UVIdentity<M>(),Intersect1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf<M> time(ray.time()[k]); + const Vec3vf<M> v0 = madd(time,Vec3vf<M>(tri.dv0),Vec3vf<M>(tri.v0)); + const Vec3vf<M> v1 = madd(time,Vec3vf<M>(tri.dv1),Vec3vf<M>(tri.v1)); + const Vec3vf<M> v2 = madd(time,Vec3vf<M>(tri.dv2),Vec3vf<M>(tri.v2)); + return pre.intersect(ray,k,v0,v1,v2,UVIdentity<M>(),Occluded1KEpilogM<M,K,filter>(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h new file mode 100644 index 0000000000..10f315cee7 --- /dev/null +++ b/thirdparty/embree/kernels/hash.h @@ -0,0 +1,5 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab" diff --git a/thirdparty/embree/kernels/subdiv/bezier_curve.h b/thirdparty/embree/kernels/subdiv/bezier_curve.h new file mode 100644 index 0000000000..a5adad5cc9 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bezier_curve.h @@ -0,0 +1,671 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +//#include "../common/scene_curves.h" +#include "../common/context.h" + +namespace embree +{ + class BezierBasis + { + public: + + template<typename T> + static __forceinline Vec4<T> eval(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = t0 * t0 * t0; + const T B1 = 3.0f * t1 * (t0 * t0); + const T B2 = 3.0f * (t1 * t1) * t0; + const T B3 = t1 * t1 * t1; + return Vec4<T>(B0,B1,B2,B3); + } + + template<typename T> + static __forceinline Vec4<T> derivative(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = -(t0*t0); + const T B1 = madd(-2.0f,t0*t1,t0*t0); + const T B2 = msub(+2.0f,t0*t1,t1*t1); + const T B3 = +(t1*t1); + return T(3.0f)*Vec4<T>(B0,B1,B2,B3); + } + + template<typename T> + static __forceinline Vec4<T> derivative2(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = t0; + const T B1 = madd(-2.0f,t0,t1); + const T B2 = madd(-2.0f,t1,t0); + const T B3 = t1; + return T(6.0f)*Vec4<T>(B0,B1,B2,B3); + } + }; + + struct PrecomputedBezierBasis + { + enum { N = 16 }; + public: + PrecomputedBezierBasis() {} + PrecomputedBezierBasis(int shift); + + /* basis for bezier evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bezier derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedBezierBasis bezier_basis0; + extern PrecomputedBezierBasis bezier_basis1; + + + template<typename V> + struct LinearBezierCurve + { + V v0,v1; + + __forceinline LinearBezierCurve () {} + + __forceinline LinearBezierCurve (const LinearBezierCurve& other) + : v0(other.v0), v1(other.v1) {} + + __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& other) { + v0 = other.v0; v1 = other.v1; return *this; + } + + __forceinline LinearBezierCurve (const V& v0, const V& v1) + : v0(v0), v1(v1) {} + + __forceinline V begin() const { return v0; } + __forceinline V end () const { return v1; } + + bool hasRoot() const; + + friend embree_ostream operator<<(embree_ostream cout, const LinearBezierCurve& a) { + return cout << "LinearBezierCurve (" << a.v0 << ", " << a.v1 << ")"; + } + }; + + template<> __forceinline bool LinearBezierCurve<Interval1f>::hasRoot() const { + return numRoots(v0,v1); + } + + template<typename V> + struct QuadraticBezierCurve + { + V v0,v1,v2; + + __forceinline QuadraticBezierCurve () {} + + __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other) + : v0(other.v0), v1(other.v1), v2(other.v2) {} + + __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) { + v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this; + } + + __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2) + : v0(v0), v1(v1), v2(v2) {} + + __forceinline V begin() const { return v0; } + __forceinline V end () const { return v2; } + + __forceinline V interval() const { + return merge(v0,v1,v2); + } + + __forceinline BBox<V> bounds() const { + return merge(BBox<V>(v0),BBox<V>(v1),BBox<V>(v2)); + } + + friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) { + return cout << "QuadraticBezierCurve ( (" << a.u.lower << ", " << a.u.upper << "), " << a.v0 << ", " << a.v1 << ", " << a.v2 << ")"; + } + }; + + + typedef QuadraticBezierCurve<float> QuadraticBezierCurve1f; + typedef QuadraticBezierCurve<Vec2fa> QuadraticBezierCurve2fa; + typedef QuadraticBezierCurve<Vec3fa> QuadraticBezierCurve3fa; + + template<typename Vertex> + struct CubicBezierCurve + { + Vertex v0,v1,v2,v3; + + __forceinline CubicBezierCurve() {} + + template<typename T1> + __forceinline CubicBezierCurve (const CubicBezierCurve<T1>& other) + : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {} + + __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) { + v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this; + } + + __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return v0; + } + + __forceinline Vertex end() const { + return v3; + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline Vertex begin_direction() const { + return v1-v0; + } + + __forceinline Vertex end_direction() const { + return v3-v2; + } + + __forceinline CubicBezierCurve<float> xfm(const Vertex& dx) const { + return CubicBezierCurve<float>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); + } + + __forceinline CubicBezierCurve<vfloatx> vxfm(const Vertex& dx) const { + return CubicBezierCurve<vfloatx>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); + } + + __forceinline CubicBezierCurve<float> xfm(const Vertex& dx, const Vertex& p) const { + return CubicBezierCurve<float>(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx)); + } + + __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space) const + { + const Vec3fa q0 = xfmVector(space,v0); + const Vec3fa q1 = xfmVector(space,v1); + const Vec3fa q2 = xfmVector(space,v2); + const Vec3fa q3 = xfmVector(space,v3); + return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3fa q0 = xfmVector(space,v0-p); + const Vec3fa q1 = xfmVector(space,v1-p); + const Vec3fa q2 = xfmVector(space,v2-p); + const Vec3fa q3 = xfmVector(space,v3-p); + return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); + const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); + const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); + const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); + return CubicBezierCurve<Vec3ff>(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const + { + const Vec3fa q0 = xfmVector(space,s*(v0-p)); + const Vec3fa q1 = xfmVector(space,s*(v1-p)); + const Vec3fa q2 = xfmVector(space,s*(v2-p)); + const Vec3fa q3 = xfmVector(space,s*(v3-p)); + return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3); + } + + __forceinline int maxRoots() const; + + __forceinline BBox<Vertex> bounds() const { + return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); + } + + __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3); + } + + __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3); + } + + __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) { + return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3); + } + + __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b, const CubicBezierCurve& c) { + return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3)); + } + + __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) { + return cmadd((Vertex(1.0f)-t),a,t*b); + } + + __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3)); + } + + __forceinline void split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + new (&left ) CubicBezierCurve(p00,p10,p20,p30); + new (&right) CubicBezierCurve(p30,p21,p12,p03); + } + + __forceinline CubicBezierCurve<Vec2vfx> split() const + { + const float u0 = 0.0f, u1 = 1.0f; + const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); + Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); + const Vec2vfx P3 = shift_right_1(P0); + const Vec2vfx dP3du = shift_right_1(dP0du); + const Vec2vfx P1 = P0 + dP0du; + const Vec2vfx P2 = P3 - dP3du; + return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3); + } + + __forceinline CubicBezierCurve<Vec2vfx> split(const BBox1f& u) const + { + const float u0 = u.lower, u1 = u.upper; + const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); + Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); + const Vec2vfx P3 = shift_right_1(P0); + const Vec2vfx dP3du = shift_right_1(dP0du); + const Vec2vfx P1 = P0 + dP0du; + const Vec2vfx P2 = P3 - dP3du; + return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3); + } + + __forceinline void eval(float t, Vertex& p, Vertex& dp) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + p = p30; + dp = Vertex(3.0f)*(p21-p20); + } + +#if 0 + __forceinline Vertex eval(float t) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + return p30; + } +#else + __forceinline Vertex eval(const float t) const + { + const Vec4<float> b = BezierBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } +#endif + + __forceinline Vertex eval_dt(float t) const + { + const Vertex p00 = v1-v0; + const Vertex p01 = v2-v1; + const Vertex p02 = v3-v2; + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p20 = lerp(p10,p11,t); + return Vertex(3.0f)*p20; + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4<float> b = BezierBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4<float> b = BezierBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const + { + const Vec2vfx p00 = v0; + const Vec2vfx p01 = v1; + const Vec2vfx p02 = v2; + const Vec2vfx p03 = v3; + + const Vec2vfx p10 = lerp(p00,p01,t); + const Vec2vfx p11 = lerp(p01,p02,t); + const Vec2vfx p12 = lerp(p02,p03,t); + + const Vec2vfx p20 = lerp(p10,p11,t); + const Vec2vfx p21 = lerp(p11,p12,t); + + const Vec2vfx p30 = lerp(p20,p21,t); + + p = p30; + dp = vfloatx(3.0f)*(p21-p20); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + p = p30; + dp = 3.0f*(p21-p20); + ddp = eval_dudu(t); + } + + __forceinline CubicBezierCurve clip(const Interval1f& u1) const + { + Vertex f0,df0; eval(u1.lower,f0,df0); + Vertex f1,df1; eval(u1.upper,f1,df1); + float s = u1.upper-u1.lower; + return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); + } + + __forceinline QuadraticBezierCurve<Vertex> derivative() const + { + const Vertex q0 = 3.0f*(v1-v0); + const Vertex q1 = 3.0f*(v2-v1); + const Vertex q2 = 3.0f*(v3-v2); + return QuadraticBezierCurve<Vertex>(q0,q1,q2); + } + + __forceinline BBox<Vertex> derivative_bounds(const Interval1f& u1) const + { + Vertex f0,df0; eval(u1.lower,f0,df0); + Vertex f3,df3; eval(u1.upper,f3,df3); + const float s = u1.upper-u1.lower; + const Vertex f1 = f0+s*(1.0f/3.0f)*df0; + const Vertex f2 = f3-s*(1.0f/3.0f)*df3; + const Vertex q0 = s*df0; + const Vertex q1 = 3.0f*(f2-f1); + const Vertex q2 = s*df3; + return merge(BBox<Vertex>(q0),BBox<Vertex>(q1),BBox<Vertex>(q2)); + } + + template<int M> + __forceinline Vec4vf<M> veval(const vfloat<M>& t) const + { + const Vec4vf<M> b = BezierBasis::eval(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const + { + const Vec4vf<M> b = BezierBasis::derivative(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const + { + const Vec4vf<M> b = BezierBasis::derivative2(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const + { + const Vec4vf<M> p00 = v0; + const Vec4vf<M> p01 = v1; + const Vec4vf<M> p02 = v2; + const Vec4vf<M> p03 = v3; + + const Vec4vf<M> p10 = lerp(p00,p01,t); + const Vec4vf<M> p11 = lerp(p01,p02,t); + const Vec4vf<M> p12 = lerp(p02,p03,t); + const Vec4vf<M> p20 = lerp(p10,p11,t); + const Vec4vf<M> p21 = lerp(p11,p12,t); + const Vec4vf<M> p30 = lerp(p20,p21,t); + + p = p30; + dp = vfloat<M>(3.0f)*(p21-p20); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3)))); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3)))); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3)))); + } + + template<int M, typename Vec = Vec4vf<M>> + __forceinline Vec derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0), + madd(vfloat<M>::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1), + madd(vfloat<M>::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2), + vfloat<M>::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3)))); + } + + /* calculates bounds of bezier curve geometry */ + __forceinline BBox3fa accurateBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec3vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec3vfx p = eval0<VSIZEX,Vec3vf<VSIZEX>>(i,N); + const Vec3vfx dp = derivative0<VSIZEX,Vec3vf<VSIZEX>>(i,N); + const Vec3vfx pm = p-Vec3vfx(scale)*select(vi!=vintx(0),dp,Vec3vfx(zero)); + const Vec3vfx pp = p+Vec3vfx(scale)*select(vi!=vintx(N),dp,Vec3vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + return BBox3fa(lower,upper); + } + + /* calculates bounds of bezier curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0<VSIZEX>(i,N); + const Vec4vfx dp = derivative0<VSIZEX>(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) < vintx(N); + const Vec4vfx pi = eval0<VSIZEX>(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve) { + return cout << "CubicBezierCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + +#if defined(__AVX__) + template<> + __forceinline CubicBezierCurve<vfloat4> CubicBezierCurve<vfloat4>::clip(const Interval1f& u1) const + { + const vfloat8 p00 = vfloat8(v0); + const vfloat8 p01 = vfloat8(v1); + const vfloat8 p02 = vfloat8(v2); + const vfloat8 p03 = vfloat8(v3); + + const vfloat8 t(vfloat4(u1.lower),vfloat4(u1.upper)); + const vfloat8 p10 = lerp(p00,p01,t); + const vfloat8 p11 = lerp(p01,p02,t); + const vfloat8 p12 = lerp(p02,p03,t); + const vfloat8 p20 = lerp(p10,p11,t); + const vfloat8 p21 = lerp(p11,p12,t); + const vfloat8 p30 = lerp(p20,p21,t); + + const vfloat8 f01 = p30; + const vfloat8 df01 = vfloat8(3.0f)*(p21-p20); + + const vfloat4 f0 = extract4<0>(f01), f1 = extract4<1>(f01); + const vfloat4 df0 = extract4<0>(df01), df1 = extract4<1>(df01); + const float s = u1.upper-u1.lower; + return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); + } +#endif + + template<typename Vertex> using BezierCurveT = CubicBezierCurve<Vertex>; + + typedef CubicBezierCurve<float> CubicBezierCurve1f; + typedef CubicBezierCurve<Vec2fa> CubicBezierCurve2fa; + typedef CubicBezierCurve<Vec3fa> CubicBezierCurve3fa; + typedef CubicBezierCurve<Vec3fa> BezierCurve3fa; + + template<> __forceinline int CubicBezierCurve<float>::maxRoots() const + { + float eps = 1E-4f; + bool neg0 = v0 <= 0.0f; bool zero0 = fabs(v0) < eps; + bool neg1 = v1 <= 0.0f; bool zero1 = fabs(v1) < eps; + bool neg2 = v2 <= 0.0f; bool zero2 = fabs(v2) < eps; + bool neg3 = v3 <= 0.0f; bool zero3 = fabs(v3) < eps; + return (neg0 != neg1 || zero0) + (neg1 != neg2 || zero1) + (neg2 != neg3 || zero2 || zero3); + } + + template<> __forceinline int CubicBezierCurve<Interval1f>::maxRoots() const { + return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3); + } + + template<typename CurveGeometry> + __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve) + { + return CubicBezierCurve<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } +} diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h new file mode 100644 index 0000000000..2ff03902a7 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h @@ -0,0 +1,372 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_curve.h" + +namespace embree +{ + template<class T, class S> + static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) + { + const T v0_1 = lerp(v0,v1,uu); + const T v1_1 = lerp(v1,v2,uu); + const T v2_1 = lerp(v2,v3,uu); + const T v0_2 = lerp(v0_1,v1_1,uu); + const T v1_2 = lerp(v1_1,v2_1,uu); + const T v0_3 = lerp(v0_2,v1_2,uu); + return v0_3; + } + + template<class T, class S> + static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) + { + const T v0_1 = lerp(v0,v1,uu); + const T v1_1 = lerp(v1,v2,uu); + const T v2_1 = lerp(v2,v3,uu); + const T v0_2 = lerp(v0_1,v1_1,uu); + const T v1_2 = lerp(v1_1,v2_1,uu); + return S(3.0f)*(v1_2-v0_2); + } + + template<typename Vertex> + __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 36.0f * (16.0f * v[y][x] + 4.0f * (v[y-1][x] + v[y+1][x] + v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1])); + } + + template<typename Vertex> + __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y-1][x+1])); + } + + template<typename Vertex> + __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + v[y+1][x-1] + v[y+1][x+1]); + } + + template<typename Vertex> + __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x-1] + v[y+1][x-1]); + } + + template<typename Vertex> + __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x+1] + v[y+1][x+1]); + } + + template<typename Vertex> + __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x) + { + return 1.0f / 9.0f * (4.0f * v[y][x] + 2.0f * (v[y+delta_y][x] + v[y][x+delta_x]) + v[y+delta_y][x+delta_x]); + } + + template<typename Vertex, typename Vertex_t> + class __aligned(64) BezierPatchT + { + public: + Vertex matrix[4][4]; + + public: + + __forceinline BezierPatchT() {} + + __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride); + + __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch); + + __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, + const BezierCurveT<Vertex>* border0, + const BezierCurveT<Vertex>* border1, + const BezierCurveT<Vertex>* border2, + const BezierCurveT<Vertex>* border3); + + __forceinline BezierPatchT(const BSplinePatchT<Vertex,Vertex_t>& source) + { + /* compute inner bezier control points */ + matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1); + matrix[0][3] = computeInnerBezierControlPoint(source.v,1,2); + matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2); + matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1); + + /* compute top edge control points */ + matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1); + matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); + + /* compute buttom edge control points */ + matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1); + matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2); + + /* compute left edge control points */ + matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1); + matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1); + + /* compute right edge control points */ + matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2); + matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2); + + /* compute corner control points */ + matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1); + matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1); + matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1); + matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1); + } + + static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv) + { + const Vertex_t M0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); + const Vertex_t M1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3]))); + const Vertex_t M2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3]))); + const Vertex_t M3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3]))); + return madd(Bv.x,M0,madd(Bv.y,M1,madd(Bv.z,M2,Bv.w*M3))); + } + + static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::derivative(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative2(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::derivative2(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative(uu); + const Vec4f Bv = BezierBasis::derivative(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vertex_t dPdu = eval_du(matrix,uu,vv); + const Vertex_t dPdv = eval_dv(matrix,uu,vv); + return cross(dPdu,dPdv); + } + + __forceinline Vertex_t normal(const float uu, const float vv) + { + const Vertex_t dPdu = eval_du(matrix,uu,vv); + const Vertex_t dPdv = eval_dv(matrix,uu,vv); + return cross(dPdu,dPdv); + } + + __forceinline Vertex_t eval(const float uu, const float vv) const { + return eval(matrix,uu,vv); + } + + __forceinline Vertex_t eval_du(const float uu, const float vv) const { + return eval_du(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dv(const float uu, const float vv) const { + return eval_dv(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dudu(const float uu, const float vv) const { + return eval_dudu(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { + return eval_dvdv(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { + return eval_dudv(matrix,uu,vv); + } + + __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const + { + const vfloat curve0_x = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]); + const vfloat curve1_x = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]); + const vfloat curve2_x = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]); + const vfloat curve3_x = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]); + return u_n[0] * curve0_x + u_n[1] * curve1_x + u_n[2] * curve2_x + u_n[3] * curve3_x; + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n)); + } + if (dPdu) + { + { + assert(dPdu); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + { + assert(dPdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + } + if (ddPdudu) + { + { + assert(ddPdudu); + const Vec4<vfloat> u_n = BezierBasis::derivative2(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdvdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdudv); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + } + } + + template<typename T> + static __forceinline Vec3<T> eval(const Vertex matrix[4][4], const T& uu, const T& vv) + { + const T one_minus_uu = 1.0f - uu; + const T one_minus_vv = 1.0f - vv; + + const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; + const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; + const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); + const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); + const T B2_u = 3.0f * (uu * one_minus_uu * uu); + const T B2_v = 3.0f * (vv * one_minus_vv * vv); + const T B3_u = uu * uu * uu; + const T B3_v = vv * vv * vv; + + const T x = + madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), + madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))), + madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))), + B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); + + const T y = + madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), + madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))), + madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))), + B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); + + const T z = + madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), + madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))), + madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))), + B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); + + return Vec3<T>(x,y,z); + } + + template<typename vfloat> + __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const { + return eval(matrix,uu,vv); + } + + template<class T> + static __forceinline Vec3<T> normal(const Vertex matrix[4][4], const T& uu, const T& vv) + { + + const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); + const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); + const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); + const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); + + const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); + const Vec3<T> matrix_11 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3<T> matrix_12 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); + + const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); + const Vec3<T> matrix_21 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + const Vec3<T> matrix_22 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); + + const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); + const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); + const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); + const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); + + /* tangentU */ + const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); + const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); + const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); + const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); + + const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); + const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); + const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); + const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); + + const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vec3<T> n = cross(tangentU,tangentV); + return n; + } + + template<typename vfloat> + __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const { + return normal(matrix,uu,vv); + } + }; + + typedef BezierPatchT<Vec3fa,Vec3fa_t> BezierPatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/bilinear_patch.h b/thirdparty/embree/kernels/subdiv/bilinear_patch.h new file mode 100644 index 0000000000..cade104a6c --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bilinear_patch.h @@ -0,0 +1,191 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_curve.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) BilinearPatchT + { + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + public: + Vertex v[4]; + + public: + + __forceinline BilinearPatchT () {} + + __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView<Vertex>& vertices) { + init(edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(edge,vertices,stride); + } + + __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride) + { + v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + } + + __forceinline BilinearPatchT (const CatmullClarkPatch& patch) + { + v[0] = patch.ring[0].getLimitVertex(); + v[1] = patch.ring[1].getLimitVertex(); + v[2] = patch.ring[2].getLimitVertex(); + v[3] = patch.ring[3].getLimitVertex(); + } + + __forceinline BBox<Vertex> bounds() const + { + + BBox<Vertex> bounds (v[0]); + bounds.extend(v[1]); + bounds.extend(v[2]); + bounds.extend(v[3]); + return bounds; + } + + __forceinline Vertex eval(const float uu, const float vv) const { + return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv); + } + + __forceinline Vertex eval_du(const float uu, const float vv) const { + return lerp(v[1]-v[0],v[2]-v[3],vv); + } + + __forceinline Vertex eval_dv(const float uu, const float vv) const { + return lerp(v[3]-v[0],v[2]-v[1],uu); + } + + __forceinline Vertex eval_dudu(const float uu, const float vv) const { + return Vertex(zero); + } + + __forceinline Vertex eval_dvdv(const float uu, const float vv) const { + return Vertex(zero); + } + + __forceinline Vertex eval_dudv(const float uu, const float vv) const { + return (v[2]-v[3]) - (v[1]-v[0]); + } + + __forceinline Vertex normal(const float uu, const float vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + __forceinline void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv); + const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv); + const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv); + return Vec3<vfloat>(x,y,z); + } + + template<class vfloat> + __forceinline Vec3<vfloat> eval_du(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv); + const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv); + const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv); + return Vec3<vfloat>(x,y,z); + } + + template<class vfloat> + __forceinline Vec3<vfloat> eval_dv(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu); + const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu); + const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu); + return Vec3<vfloat>(x,y,z); + } + + template<typename vfloat> + __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + template<class vfloat> + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv); + } + + template<class vfloat> + __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv); + } + + template<class vfloat> + __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu); + } + + template<class vfloat> + __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const { + return vfloat(zero); + } + + template<class vfloat> + __forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return vfloat(zero); + } + + template<class vfloat> + __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]); + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv)); + } + if (dPdu) { + for (size_t i=0; i<N; i++) { + assert(dPdu); vfloat::store(valid,dPdu+i*dstride,eval_du(i,uu,vv)*dscale); + assert(dPdv); vfloat::store(valid,dPdv+i*dstride,eval_dv(i,uu,vv)*dscale); + } + } + if (ddPdudu) { + for (size_t i=0; i<N; i++) { + assert(ddPdudu); vfloat::store(valid,ddPdudu+i*dstride,eval_dudu(i,uu,vv)*sqr(dscale)); + assert(ddPdvdv); vfloat::store(valid,ddPdvdv+i*dstride,eval_dvdv(i,uu,vv)*sqr(dscale)); + assert(ddPdudv); vfloat::store(valid,ddPdudv+i*dstride,eval_dudv(i,uu,vv)*sqr(dscale)); + } + } + } + }; + + typedef BilinearPatchT<Vec3fa,Vec3fa_t> BilinearPatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/bspline_curve.h b/thirdparty/embree/kernels/subdiv/bspline_curve.h new file mode 100644 index 0000000000..51489ef37c --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bspline_curve.h @@ -0,0 +1,320 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "bezier_curve.h" + +namespace embree +{ + class BSplineBasis + { + public: + + template<typename T> + static __forceinline Vec4<T> eval(const T& u) + { + const T t = u; + const T s = T(1.0f) - u; + const T n0 = s*s*s; + const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t)); + const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s)); + const T n3 = t*t*t; + return T(1.0f/6.0f)*Vec4<T>(n0,n1,n2,n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = -s*s; + const T n1 = -t*t - 4.0f*(t*s); + const T n2 = s*s + 4.0f*(s*t); + const T n3 = t*t; + return T(0.5f)*Vec4<T>(n0,n1,n2,n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative2(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = s; + const T n1 = t - 2.0f*s; + const T n2 = s - 2.0f*t; + const T n3 = t; + return Vec4<T>(n0,n1,n2,n3); + } + }; + + struct PrecomputedBSplineBasis + { + enum { N = 16 }; + public: + PrecomputedBSplineBasis() {} + PrecomputedBSplineBasis(int shift); + + /* basis for bspline evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bspline derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedBSplineBasis bspline_basis0; + extern PrecomputedBSplineBasis bspline_basis1; + + template<typename Vertex> + struct BSplineCurveT + { + Vertex v0,v1,v2,v3; + + __forceinline BSplineCurveT() {} + + __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + } + + __forceinline Vertex end() const { + return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline BBox<Vertex> bounds() const { + return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); + } + + __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) { + return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline BSplineCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); + const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); + const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); + const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); + return BSplineCurveT<Vec3ff>(q0,q1,q2,q3); + } + + __forceinline Vertex eval(const float t) const + { + const Vec4<float> b = BSplineBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4<float> b = BSplineBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4<float> b = BSplineBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + p = eval(t); + dp = eval_du(t); + ddp = eval_dudu(t); + } + + template<int M> + __forceinline Vec4vf<M> veval(const vfloat<M>& t) const + { + const Vec4vf<M> b = BSplineBasis::eval(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const + { + const Vec4vf<M> b = BSplineBasis::derivative(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const + { + const Vec4vf<M> b = BSplineBasis::derivative2(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const + { + p = veval<M>(t); + dp = veval_du<M>(t); + } + + template<int M> + __forceinline Vec4vf<M> eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + /* calculates bounds of bspline curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0<VSIZEX>(i,N); + const Vec4vfx dp = derivative0<VSIZEX>(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + const Vec3ff pe = end(); + return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<=N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) <= vintx(N); + const Vec4vfx pi = eval0<VSIZEX>(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(lower,upper),upper_r); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) { + return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + + template<typename Vertex> + __forceinline void convert(const BezierCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) { + ocurve = icurve; + } + + template<typename Vertex> + __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) { + ocurve = icurve; + } + + template<typename Vertex> + __forceinline void convert(const BezierCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) + { + const Vertex v0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2)); + const Vertex v1 = msub(2.0f,icurve.v1,icurve.v2); + const Vertex v2 = msub(2.0f,icurve.v2,icurve.v1); + const Vertex v3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3)); + ocurve = BSplineCurveT<Vertex>(v0,v1,v2,v3); + } + + template<typename Vertex> + __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) + { + const Vertex v0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2)); + const Vertex v1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2); + const Vertex v2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2); + const Vertex v3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3)); + ocurve = BezierCurveT<Vertex>(v0,v1,v2,v3); + } + + template<typename CurveGeometry> + __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve) + { + return BSplineCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } + + typedef BSplineCurveT<Vec3fa> BSplineCurve3fa; +} + diff --git a/thirdparty/embree/kernels/subdiv/bspline_patch.h b/thirdparty/embree/kernels/subdiv/bspline_patch.h new file mode 100644 index 0000000000..ff47f01c7a --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/bspline_patch.h @@ -0,0 +1,449 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bspline_curve.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) BSplinePatchT + { + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + public: + + __forceinline BSplinePatchT () {} + + __forceinline BSplinePatchT (const CatmullClarkPatch& patch) { + init(patch); + } + + __forceinline BSplinePatchT(const CatmullClarkPatch& patch, + const BezierCurveT<Vertex>* border0, + const BezierCurveT<Vertex>* border1, + const BezierCurveT<Vertex>* border2, + const BezierCurveT<Vertex>* border3) + { + init(patch); + } + + __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(edge,vertices,stride); + } + + __forceinline Vertex hard_corner(const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + return 4.0f*v11 - 2.0f*(v12+v21) + v22; + } + + __forceinline Vertex soft_convex_corner( const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + return -8.0f*v11 + 4.0f*(v12+v21) + v22; + } + + __forceinline Vertex convex_corner(const float vertex_crease_weight, + const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + if (std::isinf(vertex_crease_weight)) return hard_corner(v01,v02,v10,v11,v12,v20,v21,v22); + else return soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22); + } + + __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) { + return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride); + } + + __forceinline void init_border(const CatmullClarkRing& edge0, + Vertex& v01, Vertex& v02, + const Vertex& v11, const Vertex& v12, + const Vertex& v21, const Vertex& v22) + { + if (likely(edge0.has_opposite_back(0))) + { + v01 = edge0.back(2); + v02 = edge0.back(1); + } else { + v01 = 2.0f*v11-v21; + v02 = 2.0f*v12-v22; + } + } + + __forceinline void init_corner(const CatmullClarkRing& edge0, + Vertex& v00, const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1); + const bool has_back0 = edge0.has_opposite_back(0); + const bool has_front1 = edge0.has_opposite_front(1); + const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2); + + if (likely(has_back0)) { + if (likely(has_front1)) { assert(has_back1 && has_front2); v00 = edge0.back(3); } + else { assert(!has_back1); v00 = 2.0f*v01-v02; } + } + else { + if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; } + else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); + } + } + + void init(const CatmullClarkPatch& patch) + { + /* fill inner vertices */ + const Vertex v11 = v[1][1] = patch.ring[0].vtx; + const Vertex v12 = v[1][2] = patch.ring[1].vtx; + const Vertex v22 = v[2][2] = patch.ring[2].vtx; + const Vertex v21 = v[2][1] = patch.ring[3].vtx; + + /* fill border vertices */ + init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22); + init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21); + init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11); + init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12); + + /* fill corner vertices */ + init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); + init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); + init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); + init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); + } + + void init_border(const HalfEdge* edge0, const char* vertices, size_t stride, + Vertex& v01, Vertex& v02, + const Vertex& v11, const Vertex& v12, + const Vertex& v21, const Vertex& v22) + { + if (likely(edge0->hasOpposite())) + { + const HalfEdge* e = edge0->opposite()->next()->next(); + v01 = load(e,vertices,stride); + v02 = load(e->next(),vertices,stride); + } else { + v01 = 2.0f*v11-v21; + v02 = 2.0f*v12-v22; + } + } + + void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride, + Vertex& v00, const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + const bool has_back0 = edge0->hasOpposite(); + const bool has_front1 = edge0->prev()->hasOpposite(); + + if (likely(has_back0)) + { + const HalfEdge* e = edge0->opposite()->next(); + if (likely(has_front1)) + { + assert(e->hasOpposite()); + assert(edge0->prev()->opposite()->prev()->hasOpposite()); + v00 = load(e->opposite()->prev(),vertices,stride); + } + else { + assert(!e->hasOpposite()); + v00 = 2.0f*v01-v02; + } + } + else + { + if (likely(has_front1)) { + assert(!edge0->prev()->opposite()->prev()->hasOpposite()); + v00 = 2.0f*v10-v20; + } + else { + assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight)); + v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); + } + } + } + + void init(const HalfEdge* edge0, const char* vertices, size_t stride) + { + assert( edge0->isRegularFace() ); + + /* fill inner vertices */ + const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next(); + const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next(); + const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next(); + const Vertex v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0 == edge3->next()); + + /* fill border vertices */ + init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22); + init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21); + init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11); + init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12); + + /* fill corner vertices */ + init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); + init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); + init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); + init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); + } + + __forceinline BBox<Vertex> bounds() const + { + const Vertex* const cv = &v[0][0]; + BBox<Vertex> bounds (cv[0]); + for (size_t i=1; i<16 ; i++) + bounds.extend( cv[i] ); + return bounds; + } + + __forceinline Vertex eval(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_du(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dudu(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative2(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dvdv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative2(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dudv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex normal(const float uu, const float vv) const + { + const Vertex tu = eval_du(uu,vv); + const Vertex tv = eval_dv(uu,vv); + return cross(tu,tv); + } + + template<typename T> + __forceinline Vec3<T> eval(const T& uu, const T& vv, const Vec4<T>& u_n, const Vec4<T>& v_n) const + { + const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x)))); + const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x)))); + const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x)))); + const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x)))); + const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + + const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y)))); + const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y)))); + const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y)))); + const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y)))); + const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y))); + + const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z)))); + const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z)))); + const T curve2_z = madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z)))); + const T curve3_z = madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z)))); + const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z))); + + return Vec3<T>(x,y,z); + } + + template<typename T> + __forceinline Vec3<T> eval(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::eval(uu); + const Vec4<T> v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_du(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::derivative(uu); + const Vec4<T> v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dv(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::eval(uu); + const Vec4<T> v_n = BSplineBasis::derivative(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dudu(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::derivative2(uu); + const Vec4<T> v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dvdv(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::eval(uu); + const Vec4<T> v_n = BSplineBasis::derivative2(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> eval_dudv(const T& uu, const T& vv) const + { + const Vec4<T> u_n = BSplineBasis::derivative(uu); + const Vec4<T> v_n = BSplineBasis::derivative(vv); + return eval(uu,vv,u_n,v_n); + } + + template<typename T> + __forceinline Vec3<T> normal(const T& uu, const T& vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const + { + const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); + const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i])))); + const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i])))); + const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); + return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + } + + template<typename vbool, typename vfloat> + void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + const Vec4<vfloat> u_n = BSplineBasis::eval(uu); + const Vec4<vfloat> v_n = BSplineBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n)); + } + if (dPdu) + { + { + assert(dPdu); + const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); + const Vec4<vfloat> v_n = BSplineBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + { + assert(dPdv); + const Vec4<vfloat> u_n = BSplineBasis::eval(uu); + const Vec4<vfloat> v_n = BSplineBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale); + } + } + if (ddPdudu) + { + { + assert(ddPdudu); + const Vec4<vfloat> u_n = BSplineBasis::derivative2(uu); + const Vec4<vfloat> v_n = BSplineBasis::eval(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdvdv); + const Vec4<vfloat> u_n = BSplineBasis::eval(uu); + const Vec4<vfloat> v_n = BSplineBasis::derivative2(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + { + assert(ddPdudv); + const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); + const Vec4<vfloat> v_n = BSplineBasis::derivative(vv); + for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale)); + } + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const BSplinePatchT& p) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + o << "[" << y << "][" << x << "] " << p.v[y][x] << embree_endl; + return o; + } + + public: + Vertex v[4][4]; + }; + + typedef BSplinePatchT<Vec3fa,Vec3fa_t> BSplinePatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h new file mode 100644 index 0000000000..46959797bf --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullclark_coefficients.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" + +namespace embree +{ + static const size_t MAX_PATCH_VALENCE = 16; //!< maximum number of vertices of a patch + static const size_t MAX_RING_FACE_VALENCE = 64; //!< maximum number of faces per ring + static const size_t MAX_RING_EDGE_VALENCE = 2*64; //!< maximum number of edges per ring + + class CatmullClarkPrecomputedCoefficients + { + private: + + float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1]; + + float* table_limittangent_a[MAX_RING_FACE_VALENCE+1]; + float* table_limittangent_b[MAX_RING_FACE_VALENCE+1]; + float table_limittangent_c[MAX_RING_FACE_VALENCE+1]; + + __forceinline float set_cos_2PI_div_n(const size_t n) { + if (unlikely(n == 0)) return 1.0f; + return cosf(2.0f*float(pi)/(float)n); + } + + __forceinline float set_limittangent_a(const size_t i, const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); + const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0); + return cosf(2.0f*float(pi)*(float)i/(float)n) * c1; + } + + __forceinline float set_limittangent_b(const size_t i, const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); + return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0; + } + + __forceinline float set_limittangent_c(const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + cosf(float(pi)/(float)n) * sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n))); + } + + public: + + __forceinline float cos_2PI_div_n(const size_t n) + { + if (likely(n <= MAX_RING_FACE_VALENCE)) + return table_cos_2PI_div_n[n]; + else + return set_cos_2PI_div_n(n); + } + + __forceinline float limittangent_a(const size_t i, const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + assert(i < n); + return table_limittangent_a[n][i]; + } + + __forceinline float limittangent_b(const size_t i, const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + assert(i < n); + return table_limittangent_b[n][i]; + } + + __forceinline float limittangent_c(const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + return table_limittangent_c[n]; + } + + static CatmullClarkPrecomputedCoefficients table; + + CatmullClarkPrecomputedCoefficients(); + ~CatmullClarkPrecomputedCoefficients(); + }; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree/kernels/subdiv/catmullclark_patch.h new file mode 100644 index 0000000000..91772d94ed --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullclark_patch.h @@ -0,0 +1,562 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_ring.h" +#include "bezier_curve.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) CatmullClarkPatchT + { + public: + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + typedef typename CatmullClark1Ring::Type Type; + + array_t<CatmullClark1RingT<Vertex,Vertex_t>,4> ring; + + public: + __forceinline CatmullClarkPatchT () {} + + __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) { + init(first_half_edge,vertices,stride); + } + + __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) { + init(first_half_edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride) + { + for (unsigned i=0; i<4; i++) + ring[i].init(first_half_edge+i,vertices,stride); + + assert(verify()); + } + + __forceinline size_t bytes() const { + return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes(); + } + + __forceinline void serialize(void* ptr, size_t& ofs) const + { + for (size_t i=0; i<4; i++) + ring[i].serialize((char*)ptr,ofs); + } + + __forceinline void deserialize(void* ptr) + { + size_t ofs = 0; + for (size_t i=0; i<4; i++) + ring[i].deserialize((char*)ptr,ofs); + } + + __forceinline BBox3fa bounds() const + { + BBox3fa bounds (ring[0].bounds()); + for (size_t i=1; i<4; i++) + bounds.extend(ring[i].bounds()); + return bounds; + } + + __forceinline Type type() const + { + const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES; + return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES); + } + + __forceinline bool isFinalResolution(float res) const { + return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res); + } + + static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0, + const CatmullClark1RingT<Vertex,Vertex_t>& p1, + CatmullClark1RingT<Vertex,Vertex_t>& dest0, + CatmullClark1RingT<Vertex,Vertex_t>& dest1) + { + assert(p1.face_valence > 2); + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 4; + dest1.edge_valence = dest0.edge_valence = 8; + dest1.border_index = dest0.border_index = -1; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; + dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; + dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; + dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 3; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0, + const CatmullClark1RingT<Vertex,Vertex_t> &p1, + CatmullClark1RingT<Vertex,Vertex_t> &dest0, + CatmullClark1RingT<Vertex,Vertex_t> &dest1) + { + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 3; + dest1.edge_valence = dest0.edge_valence = 6; + dest0.border_index = 2; + dest1.border_index = 4; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 2; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_regular(const Vertex_t ¢er, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest) + { + dest.vertex_level = 0.0f; + dest.face_valence = 4; + dest.edge_valence = 8; + dest.border_index = -1; + dest.vtx = (Vertex_t)center; + dest.vertex_crease_weight = 0.0f; + for (size_t i=0; i<8; i++) + dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8]; + for (size_t i=0; i<4; i++) + dest.crease_weight[i] = 0.0f; + + dest.eval_start_index = (8-offset)>>1; + if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; + assert( dest.eval_start_index < dest.face_valence ); + dest.eval_unique_identifier = 0; + } + + __noinline void subdivide(array_t<CatmullClarkPatchT,4>& patch) const + { + ring[0].subdivide(patch[0].ring[0]); + ring[1].subdivide(patch[1].ring[1]); + ring[2].subdivide(patch[2].ring[2]); + ring[3].subdivide(patch[3].ring[3]); + + patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level; + patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level; + + patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level; + patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level; + patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + + patch[2].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level; + patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level; + patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + + patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level; + patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level; + + const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2; + if (likely(regular0)) + init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); + else + init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); + + const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2; + if (likely(regular1)) + init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); + else + init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); + + const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2; + if (likely(regular2)) + init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); + else + init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); + + const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2; + if (likely(regular3)) + init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); + else + init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); + + Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f; + + Vertex_t center_ring[8]; + center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0]; + center_ring[7] = (Vertex_t)patch[3].ring[3].vtx; + center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0]; + center_ring[5] = (Vertex_t)patch[2].ring[2].vtx; + center_ring[4] = (Vertex_t)patch[1].ring[1].ring[0]; + center_ring[3] = (Vertex_t)patch[1].ring[1].vtx; + center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0]; + center_ring[1] = (Vertex_t)patch[0].ring[0].vtx; + + init_regular(center,center_ring,0,patch[0].ring[2]); + init_regular(center,center_ring,2,patch[1].ring[3]); + init_regular(center,center_ring,4,patch[2].ring[0]); + init_regular(center,center_ring,6,patch[3].ring[1]); + + assert(patch[0].verify()); + assert(patch[1].verify()); + assert(patch[2].verify()); + assert(patch[3].verify()); + } + + bool verify() const { + return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions(); + } + + __forceinline void init( FinalQuad& quad ) const + { + quad.vtx[0] = (Vertex_t)ring[0].vtx; + quad.vtx[1] = (Vertex_t)ring[1].vtx; + quad.vtx[2] = (Vertex_t)ring[2].vtx; + quad.vtx[3] = (Vertex_t)ring[3].vtx; + }; + + friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p) + { + o << "CatmullClarkPatch { " << embree_endl; + for (size_t i=0; i<4; i++) + o << "ring" << i << ": " << p.ring[i] << embree_endl; + o << "}" << embree_endl; + return o; + } + }; + + typedef CatmullClarkPatchT<Vec3fa,Vec3fa_t> CatmullClarkPatch3fa; + + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) GeneralCatmullClarkPatchT + { + public: + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + typedef BezierCurveT<Vertex> BezierCurve; + + static const unsigned SIZE = MAX_PATCH_VALENCE; + DynamicStackArray<GeneralCatmullClark1RingT<Vertex,Vertex_t>,8,SIZE> ring; + unsigned N; + + __forceinline GeneralCatmullClarkPatchT () + : N(0) {} + + GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) { + init(h,vertices,stride); + } + + __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) { + init(first_half_edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride) + { + unsigned int i = 0; + const HalfEdge* edge = h; + do { + ring[i].init(edge,vertices,stride); + edge = edge->next(); + i++; + } while ((edge != h) && (i < SIZE)); + N = i; + } + + __forceinline unsigned size() const { + return N; + } + + __forceinline bool isQuadPatch() const { + return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads; + } + + static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0, + const CatmullClark1RingT<Vertex,Vertex_t>& p1, + CatmullClark1RingT<Vertex,Vertex_t>& dest0, + CatmullClark1RingT<Vertex,Vertex_t>& dest1) + { + assert(p1.face_valence > 2); + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 4; + dest1.edge_valence = dest0.edge_valence = 8; + dest1.border_index = dest0.border_index = -1; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; + dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; + dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; + dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 3; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + + static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0, + const CatmullClark1RingT<Vertex,Vertex_t> &p1, + CatmullClark1RingT<Vertex,Vertex_t> &dest0, + CatmullClark1RingT<Vertex,Vertex_t> &dest1) + { + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 3; + dest1.edge_valence = dest0.edge_valence = 6; + dest0.border_index = 2; + dest1.border_index = 4; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 2; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_regular(const Vertex_t ¢er, const array_t<Vertex_t,2*SIZE>& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest) + { + assert(N<(MAX_RING_FACE_VALENCE)); + assert(2*N<(MAX_RING_EDGE_VALENCE)); + dest.vertex_level = vertex_level; + dest.face_valence = N; + dest.edge_valence = 2*N; + dest.border_index = -1; + dest.vtx = (Vertex_t)center; + dest.vertex_crease_weight = 0.0f; + for (unsigned i=0; i<2*N; i++) { + dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)]; + assert(isvalid(dest.ring[i])); + } + for (unsigned i=0; i<N; i++) + dest.crease_weight[i] = 0.0f; + + assert(offset <= 2*N); + dest.eval_start_index = (2*N-offset)>>1; + if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; + + assert( dest.eval_start_index < dest.face_valence ); + dest.eval_unique_identifier = 0; + } + + __noinline void subdivide(array_t<CatmullClarkPatch,SIZE>& patch, unsigned& N_o) const + { + N_o = N; + assert( N ); + for (unsigned i=0; i<N; i++) { + unsigned ip1 = (i+1)%N; // FIXME: % + ring[i].subdivide(patch[i].ring[0]); + patch[i] .ring[0].edge_level = 0.5f*ring[i].edge_level; + patch[ip1].ring[3].edge_level = 0.5f*ring[i].edge_level; + + assert( patch[i].ring[0].hasValidPositions() ); + + } + assert(N < 2*SIZE); + Vertex_t center = Vertex_t(0.0f); + array_t<Vertex_t,2*SIZE> center_ring; + float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads + + for (unsigned i=0; i<N; i++) + { + unsigned ip1 = (i+1)%N; // FIXME: % + unsigned im1 = (i+N-1)%N; // FIXME: % + bool regular = ring[i].has_last_face() && ring[ip1].face_valence > 2; + if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); + else init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); + + assert( patch[i].ring[1].hasValidPositions() ); + assert( patch[ip1].ring[3].hasValidPositions() ); + + float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level); + patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level; + center_vertex_level = max(center_vertex_level,level); + + center += ring[i].vtx; + center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx; + center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0]; + } + center /= float(N); + + for (unsigned int i=0; i<N; i++) { + init_regular(center,center_ring,center_vertex_level,N,2*i,patch[i].ring[2]); + + assert( patch[i].ring[2].hasValidPositions() ); + } + } + + void init(CatmullClarkPatch& patch) const + { + assert(size() == 4); + ring[0].convert(patch.ring[0]); + ring[1].convert(patch.ring[1]); + ring[2].convert(patch.ring[2]); + ring[3].convert(patch.ring[3]); + } + + static void fix_quad_ring_order (array_t<CatmullClarkPatch,GeneralCatmullClarkPatchT::SIZE>& patches) + { + CatmullClark1Ring patches1ring1 = patches[1].ring[1]; + patches[1].ring[1] = patches[1].ring[0]; // FIXME: optimize these assignments + patches[1].ring[0] = patches[1].ring[3]; + patches[1].ring[3] = patches[1].ring[2]; + patches[1].ring[2] = patches1ring1; + + CatmullClark1Ring patches2ring2 = patches[2].ring[2]; + patches[2].ring[2] = patches[2].ring[0]; + patches[2].ring[0] = patches2ring2; + CatmullClark1Ring patches2ring3 = patches[2].ring[3]; + patches[2].ring[3] = patches[2].ring[1]; + patches[2].ring[1] = patches2ring3; + + CatmullClark1Ring patches3ring3 = patches[3].ring[3]; + patches[3].ring[3] = patches[3].ring[0]; + patches[3].ring[0] = patches[3].ring[1]; + patches[3].ring[1] = patches[3].ring[2]; + patches[3].ring[2] = patches3ring3; + } + + __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const + { + Vertex P0 = ring[0].getLimitVertex(); + for (unsigned i=0; i<N; i++) + { + const unsigned i0 = i, i1 = i+1==N ? 0 : i+1; + const Vertex P1 = madd(1.0f/3.0f,ring[i0].getLimitTangent(),P0); + const Vertex P3 = ring[i1].getLimitVertex(); + const Vertex P2 = madd(1.0f/3.0f,ring[i1].getSecondLimitTangent(),P3); + new (&curves[i]) BezierCurve(P0,P1,P2,P3); + P0 = P3; + } + } + + __forceinline void getLimitBorder(BezierCurve curves[2], const unsigned subPatch) const + { + const unsigned i0 = subPatch; + const Vertex t0_p = ring[i0].getLimitTangent(); + const Vertex t0_m = ring[i0].getSecondLimitTangent(); + + const unsigned i1 = subPatch+1 == N ? 0 : subPatch+1; + const Vertex t1_p = ring[i1].getLimitTangent(); + const Vertex t1_m = ring[i1].getSecondLimitTangent(); + + const unsigned i2 = subPatch == 0 ? N-1 : subPatch-1; + const Vertex t2_p = ring[i2].getLimitTangent(); + const Vertex t2_m = ring[i2].getSecondLimitTangent(); + + const Vertex b00 = ring[i0].getLimitVertex(); + const Vertex b03 = ring[i1].getLimitVertex(); + const Vertex b33 = ring[i2].getLimitVertex(); + + const Vertex b01 = madd(1.0/3.0f,t0_p,b00); + const Vertex b11 = madd(1.0/3.0f,t0_m,b00); + + //const Vertex b13 = madd(1.0/3.0f,t1_p,b03); + const Vertex b02 = madd(1.0/3.0f,t1_m,b03); + + const Vertex b22 = madd(1.0/3.0f,t2_p,b33); + const Vertex b23 = madd(1.0/3.0f,t2_m,b33); + + new (&curves[0]) BezierCurve(b00,b01,b02,b03); + new (&curves[1]) BezierCurve(b33,b22,b11,b00); + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClarkPatchT &p) + { + o << "GeneralCatmullClarkPatch { " << embree_endl; + for (unsigned i=0; i<p.N; i++) + o << "ring" << i << ": " << p.ring[i] << embree_endl; + o << "}" << embree_endl; + return o; + } + }; + + typedef GeneralCatmullClarkPatchT<Vec3fa,Vec3fa_t> GeneralCatmullClarkPatch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h new file mode 100644 index 0000000000..e5ad5dadfe --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h @@ -0,0 +1,826 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" +#include "../common/buffer.h" +#include "half_edge.h" +#include "catmullclark_coefficients.h" + +namespace embree +{ + struct __aligned(64) FinalQuad { + Vec3fa vtx[4]; + }; + + template<typename Vertex, typename Vertex_t = Vertex> + struct __aligned(64) CatmullClark1RingT + { + ALIGNED_STRUCT_(64); + + int border_index; //!< edge index where border starts + unsigned int face_valence; //!< number of adjacent quad faces + unsigned int edge_valence; //!< number of adjacent edges (2*face_valence) + float vertex_crease_weight; //!< weight of vertex crease (0 if no vertex crease) + DynamicStackArray<float,16,MAX_RING_FACE_VALENCE> crease_weight; //!< edge crease weights for each adjacent edge + float vertex_level; //!< maximum level of all adjacent edges + float edge_level; //!< level of first edge + unsigned int eval_start_index; //!< topology dependent index to start evaluation + unsigned int eval_unique_identifier; //!< topology dependent unique identifier for this ring + Vertex vtx; //!< center vertex + DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; //!< ring of neighboring vertices + + public: + CatmullClark1RingT () + : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty + + /*! calculates number of bytes required to serialize this structure */ + __forceinline size_t bytes() const + { + size_t ofs = 0; + ofs += sizeof(border_index); + ofs += sizeof(face_valence); + assert(2*face_valence == edge_valence); + ofs += sizeof(vertex_crease_weight); + ofs += face_valence*sizeof(float); + ofs += sizeof(vertex_level); + ofs += sizeof(edge_level); + ofs += sizeof(eval_start_index); + ofs += sizeof(eval_unique_identifier); + ofs += sizeof(vtx); + ofs += edge_valence*sizeof(Vertex); + return ofs; + } + + template<typename Ty> + static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) { + *(Ty*)&ptr[ofs] = v; ofs += sizeof(Ty); + } + + template<typename Ty> + static __forceinline void load(char* ptr, size_t& ofs, Ty& v) { + v = *(Ty*)&ptr[ofs]; ofs += sizeof(Ty); + } + + /*! serializes the ring to some memory location */ + __forceinline void serialize(char* ptr, size_t& ofs) const + { + store(ptr,ofs,border_index); + store(ptr,ofs,face_valence); + store(ptr,ofs,vertex_crease_weight); + for (size_t i=0; i<face_valence; i++) + store(ptr,ofs,crease_weight[i]); + store(ptr,ofs,vertex_level); + store(ptr,ofs,edge_level); + store(ptr,ofs,eval_start_index); + store(ptr,ofs,eval_unique_identifier); + Vertex_t::storeu(&ptr[ofs],vtx); ofs += sizeof(Vertex); + for (size_t i=0; i<edge_valence; i++) { + Vertex_t::storeu(&ptr[ofs],ring[i]); ofs += sizeof(Vertex); + } + } + + /*! deserializes the ring from some memory location */ + __forceinline void deserialize(char* ptr, size_t& ofs) + { + load(ptr,ofs,border_index); + load(ptr,ofs,face_valence); + edge_valence = 2*face_valence; + load(ptr,ofs,vertex_crease_weight); + for (size_t i=0; i<face_valence; i++) + load(ptr,ofs,crease_weight[i]); + load(ptr,ofs,vertex_level); + load(ptr,ofs,edge_level); + load(ptr,ofs,eval_start_index); + load(ptr,ofs,eval_unique_identifier); + vtx = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex); + for (size_t i=0; i<edge_valence; i++) { + ring[i] = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex); + } + } + + __forceinline bool hasBorder() const { + return border_index != -1; + } + + __forceinline const Vertex& front(size_t i) const { + assert(edge_valence>i); + return ring[i]; + } + + __forceinline const Vertex& back(size_t i) const { + assert(edge_valence>=i); + return ring[edge_valence-i]; + } + + __forceinline bool has_last_face() const { + return (size_t)border_index != (size_t)edge_valence-2; + } + + __forceinline bool has_opposite_front(size_t i) const { + return (size_t)border_index != 2*i; + } + + __forceinline bool has_opposite_back(size_t i) const { + return (size_t)border_index != ((size_t)edge_valence-2-2*i); + } + + __forceinline BBox3fa bounds() const + { + BBox3fa bounds ( vtx ); + for (size_t i = 0; i<edge_valence ; i++) + bounds.extend( ring[i] ); + return bounds; + } + + /*! initializes the ring from the half edge structure */ + __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) + { + border_index = -1; + vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride); + vertex_crease_weight = h->vertex_crease_weight; + + HalfEdge* p = (HalfEdge*) h; + + unsigned i=0; + unsigned min_vertex_index = (unsigned)-1; + unsigned min_vertex_index_face = (unsigned)-1; + edge_level = p->edge_level; + vertex_level = 0.0f; + + do + { + vertex_level = max(vertex_level,p->edge_level); + crease_weight[i/2] = p->edge_crease_weight; + assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); + + /* store first two vertices of face */ + p = p->next(); + const unsigned index0 = p->getStartVertexIndex(); + ring[i++] = Vertex_t::loadu(vertices+index0*stride); + if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } + p = p->next(); + + const unsigned index1 = p->getStartVertexIndex(); + ring[i++] = Vertex_t::loadu(vertices+index1*stride); + p = p->next(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + /* find minimum start vertex */ + const unsigned index0 = p->getStartVertexIndex(); + if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } + + /*! mark first border edge and store dummy vertex for face between the two border edges */ + border_index = i; + crease_weight[i/2] = inf; + ring[i++] = Vertex_t::loadu(vertices+index0*stride); + ring[i++] = vtx; // dummy vertex + + /*! goto other side of border */ + p = (HalfEdge*) h; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != h); + + edge_valence = i; + face_valence = i >> 1; + eval_unique_identifier = min_vertex_index; + eval_start_index = min_vertex_index_face; + + assert( hasValidPositions() ); + } + + __forceinline void subdivide(CatmullClark1RingT& dest) const + { + dest.edge_level = 0.5f*edge_level; + dest.vertex_level = 0.5f*vertex_level; + dest.face_valence = face_valence; + dest.edge_valence = edge_valence; + dest.border_index = border_index; + dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); + dest.eval_start_index = eval_start_index; + dest.eval_unique_identifier = eval_unique_identifier; + + /* calculate face points */ + Vertex_t S = Vertex_t(0.0f); + for (size_t i=0; i<face_valence; i++) + { + size_t face_index = i + eval_start_index; if (face_index >= face_valence) face_index -= face_valence; assert(face_index < face_valence); + size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence); + size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence); + size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence); + S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f; + } + + /* calculate new edge points */ + size_t num_creases = 0; + array_t<size_t,MAX_RING_FACE_VALENCE> crease_id; + + for (size_t i=0; i<face_valence; i++) + { + size_t face_index = i + eval_start_index; + if (face_index >= face_valence) face_index -= face_valence; + const float edge_crease = crease_weight[face_index]; + dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f); + + size_t index = 2*face_index; + size_t prev_index = face_index == 0 ? edge_valence-1 : 2*face_index-1; + size_t next_index = 2*face_index+1; + + const Vertex_t v = vtx + ring[index]; + const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index]; + S += ring[index]; + + /* fast path for regular edge points */ + if (likely(edge_crease <= 0.0f)) { + dest.ring[index] = (v+f) * 0.25f; + } + + /* slower path for hard edge rule */ + else { + crease_id[num_creases++] = face_index; + dest.ring[index] = v*0.5f; + + /* even slower path for blended edge rule */ + if (unlikely(edge_crease < 1.0f)) { + dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease); + } + } + } + + /* compute new vertex using smooth rule */ + const float inv_face_valence = 1.0f / (float)face_valence; + const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence; + dest.vtx = v_smooth; + + /* compute new vertex using vertex_crease_weight rule */ + if (unlikely(vertex_crease_weight > 0.0f)) + { + if (vertex_crease_weight >= 1.0f) { + dest.vtx = vtx; + } else { + dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight); + } + return; + } + + /* no edge crease rule and dart rule */ + if (likely(num_creases <= 1)) + return; + + /* compute new vertex using crease rule */ + if (likely(num_creases == 2)) + { + /* update vertex using crease rule */ + const size_t crease0 = crease_id[0], crease1 = crease_id[1]; + const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f); + dest.vtx = v_sharp; + + /* update crease_weights using chaikin rule */ + const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1]; + dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); + dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); + + /* interpolate between sharp and smooth rule */ + const float v_blend = 0.5f*(crease_weight0+crease_weight1); + if (unlikely(v_blend < 1.0f)) { + dest.vtx = lerp(v_smooth,v_sharp,v_blend); + } + } + + /* compute new vertex using corner rule */ + else { + dest.vtx = vtx; + } + } + + __forceinline bool isRegular1() const + { + if (border_index == -1) { + if (face_valence == 4) return true; + } else { + if (face_valence < 4) return true; + } + return false; + } + + __forceinline size_t numEdgeCreases() const + { + ssize_t numCreases = 0; + for (size_t i=0; i<face_valence; i++) { + numCreases += crease_weight[i] > 0.0f; + } + return numCreases; + } + + enum Type { + TYPE_NONE = 0, //!< invalid type + TYPE_REGULAR = 1, //!< regular patch when ignoring creases + TYPE_REGULAR_CREASES = 2, //!< regular patch when considering creases + TYPE_GREGORY = 4, //!< gregory patch when ignoring creases + TYPE_GREGORY_CREASES = 8, //!< gregory patch when considering creases + TYPE_CREASES = 16 //!< patch has crease features + }; + + __forceinline Type type() const + { + /* check if there is an edge crease anywhere */ + const size_t numCreases = numEdgeCreases(); + const bool noInnerCreases = hasBorder() ? numCreases == 2 : numCreases == 0; + + Type crease_mask = (Type) (TYPE_REGULAR | TYPE_GREGORY); + if (noInnerCreases ) crease_mask = (Type) (crease_mask | TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES); + if (numCreases != 0) crease_mask = (Type) (crease_mask | TYPE_CREASES); + + /* calculate if this vertex is regular */ + bool hasBorder = border_index != -1; + if (face_valence == 2 && hasBorder) { + if (vertex_crease_weight == 0.0f ) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else if (vertex_crease_weight == float(inf)) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else return TYPE_CREASES; + } + else if (vertex_crease_weight != 0.0f) return TYPE_CREASES; + else if (face_valence == 3 && hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else if (face_valence == 4 && !hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else return (Type) (crease_mask & (TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + } + + __forceinline bool isFinalResolution(float res) const { + return vertex_level <= res; + } + + /* computes the limit vertex */ + __forceinline Vertex getLimitVertex() const + { + /* return hard corner */ + if (unlikely(std::isinf(vertex_crease_weight))) + return vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f; + } + + Vertex_t F( 0.0f ); + Vertex_t E( 0.0f ); + + assert(eval_start_index < face_valence); + + for (size_t i=0; i<face_valence; i++) { + size_t index = i+eval_start_index; + if (index >= face_valence) index -= face_valence; + F += ring[2*index+1]; + E += ring[2*index]; + } + + const float n = (float)face_valence; + return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n); + } + + /* gets limit tangent in the direction of egde vtx -> ring[0] */ + __forceinline Vertex getLimitTangent() const + { + if (unlikely(std::isinf(vertex_crease_weight))) + return ring[0] - vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + if (border_index != (int)edge_valence-2 ) { + return ring[0] - vtx; + } + else + { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (ring[second_border_index] - ring[border_index]) * 0.5f; + } + } + + Vertex_t alpha( 0.0f ); + Vertex_t beta ( 0.0f ); + + const size_t n = face_valence; + + assert(eval_start_index < face_valence); + + Vertex_t q( 0.0f ); + for (size_t i=0; i<face_valence; i++) + { + size_t index = i+eval_start_index; + if (index >= face_valence) index -= face_valence; + const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n); + const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n); + alpha += a * ring[2*index]; + beta += b * ring[2*index+1]; + } + + const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); + return sigma * (alpha + beta); + } + + /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + __forceinline Vertex getSecondLimitTangent() const + { + if (unlikely(std::isinf(vertex_crease_weight))) + return ring[2] - vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + if (border_index != 2) { + return ring[2] - vtx; + } + else { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (ring[border_index] - ring[second_border_index]) * 0.5f; + } + } + + Vertex_t alpha( 0.0f ); + Vertex_t beta ( 0.0f ); + + const size_t n = face_valence; + + assert(eval_start_index < face_valence); + + for (size_t i=0; i<face_valence; i++) + { + size_t index = i+eval_start_index; + if (index >= face_valence) index -= face_valence; + + size_t prev_index = index == 0 ? face_valence-1 : index-1; // need to be bit-wise exact in cosf eval + const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n); + const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n); + alpha += a * ring[2*index]; + beta += b * ring[2*index+1]; + } + + const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); + return sigma* (alpha + beta); + } + + /* gets surface normal */ + const Vertex getNormal() const { + return cross(getLimitTangent(),getSecondLimitTangent()); + } + + /* returns center of the n-th quad in the 1-ring */ + __forceinline Vertex getQuadCenter(const size_t index) const + { + const Vertex_t &p0 = vtx; + const Vertex_t &p1 = ring[2*index+0]; + const Vertex_t &p2 = ring[2*index+1]; + const Vertex_t &p3 = index == face_valence-1 ? ring[0] : ring[2*index+2]; + const Vertex p = (p0+p1+p2+p3) * 0.25f; + return p; + } + + /* returns center of the n-th edge in the 1-ring */ + __forceinline Vertex getEdgeCenter(const size_t index) const { + return (vtx + ring[index*2]) * 0.5f; + } + + bool hasValidPositions() const + { + for (size_t i=0; i<edge_valence; i++) { + if (!isvalid(ring[i])) + return false; + } + return true; + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClark1RingT &c) + { + o << "vtx " << c.vtx << " size = " << c.edge_valence << ", " << + "hard_edge = " << c.border_index << ", face_valence " << c.face_valence << + ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", eval_start_index: " << c.eval_start_index << ", ring: " << embree_endl; + + for (unsigned int i=0; i<min(c.edge_valence,(unsigned int)MAX_RING_FACE_VALENCE); i++) { + o << i << " -> " << c.ring[i]; + if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2]; + o << embree_endl; + } + return o; + } + }; + + typedef CatmullClark1RingT<Vec3fa,Vec3fa_t> CatmullClark1Ring3fa; + + template<typename Vertex, typename Vertex_t = Vertex> + struct __aligned(64) GeneralCatmullClark1RingT + { + ALIGNED_STRUCT_(64); + + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + + struct Face + { + __forceinline Face() {} + __forceinline Face (int size, float crease_weight) + : size(size), crease_weight(crease_weight) {} + + // FIXME: add member that returns total number of vertices + + int size; // number of vertices-2 of nth face in ring + float crease_weight; + }; + + Vertex vtx; + DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; + DynamicStackArray<Face,16,MAX_RING_FACE_VALENCE> faces; + unsigned int face_valence; + unsigned int edge_valence; + int border_face; + float vertex_crease_weight; + float vertex_level; //!< maximum level of adjacent edges + float edge_level; // level of first edge + bool only_quads; // true if all faces are quads + unsigned int eval_start_face_index; + unsigned int eval_start_vertex_index; + unsigned int eval_unique_identifier; + + public: + GeneralCatmullClark1RingT() + : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {} + + __forceinline bool isRegular() const + { + if (border_face == -1 && face_valence == 4) return true; + return false; + } + + __forceinline bool has_last_face() const { + return border_face != (int)face_valence-1; + } + + __forceinline bool has_second_face() const { + return (border_face == -1) || (border_face >= 2); + } + + bool hasValidPositions() const + { + for (size_t i=0; i<edge_valence; i++) { + if (!isvalid(ring[i])) + return false; + } + return true; + } + + __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) + { + only_quads = true; + border_face = -1; + vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride); + vertex_crease_weight = h->vertex_crease_weight; + HalfEdge* p = (HalfEdge*) h; + + unsigned int e=0, f=0; + unsigned min_vertex_index = (unsigned)-1; + unsigned min_vertex_index_face = (unsigned)-1; + unsigned min_vertex_index_vertex = (unsigned)-1; + edge_level = p->edge_level; + vertex_level = 0.0f; + do + { + HalfEdge* p_prev = p->prev(); + HalfEdge* p_next = p->next(); + const float crease_weight = p->edge_crease_weight; + assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); + vertex_level = max(vertex_level,p->edge_level); + + /* find minimum start vertex */ + unsigned vertex_index = p_next->getStartVertexIndex(); + if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } + + /* store first N-2 vertices of face */ + unsigned int vn = 0; + for (p = p_next; p!=p_prev; p=p->next()) { + ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); + vn++; + } + faces[f++] = Face(vn,crease_weight); + only_quads &= (vn == 2); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + /* find minimum start vertex */ + unsigned vertex_index = p->getStartVertexIndex(); + if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } + + /*! mark first border edge and store dummy vertex for face between the two border edges */ + border_face = f; + faces[f++] = Face(2,inf); + ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); + ring[e++] = vtx; // dummy vertex + + /*! goto other side of border */ + p = (HalfEdge*) h; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != h); + + edge_valence = e; + face_valence = f; + eval_unique_identifier = min_vertex_index; + eval_start_face_index = min_vertex_index_face; + eval_start_vertex_index = min_vertex_index_vertex; + + assert( hasValidPositions() ); + } + + __forceinline void subdivide(CatmullClark1Ring& dest) const + { + dest.edge_level = 0.5f*edge_level; + dest.vertex_level = 0.5f*vertex_level; + dest.face_valence = face_valence; + dest.edge_valence = 2*face_valence; + dest.border_index = border_face == -1 ? -1 : 2*border_face; // FIXME: + dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); + dest.eval_start_index = eval_start_face_index; + dest.eval_unique_identifier = eval_unique_identifier; + assert(dest.face_valence <= MAX_RING_FACE_VALENCE); + + /* calculate face points */ + Vertex_t S = Vertex_t(0.0f); + for (size_t face=0, v=eval_start_vertex_index; face<face_valence; face++) { + size_t f = (face + eval_start_face_index)%face_valence; + + Vertex_t F = vtx; + for (size_t k=v; k<=v+faces[f].size; k++) F += ring[k%edge_valence]; // FIXME: optimize + S += dest.ring[2*f+1] = F/float(faces[f].size+2); + v+=faces[f].size; + v%=edge_valence; + } + + /* calculate new edge points */ + size_t num_creases = 0; + array_t<size_t,MAX_RING_FACE_VALENCE> crease_id; + Vertex_t C = Vertex_t(0.0f); + for (size_t face=0, j=eval_start_vertex_index; face<face_valence; face++) + { + size_t i = (face + eval_start_face_index)%face_valence; + + const Vertex_t v = vtx + ring[j]; + Vertex_t f = dest.ring[2*i+1]; + if (i == 0) f += dest.ring[dest.edge_valence-1]; + else f += dest.ring[2*i-1]; + S += ring[j]; + dest.crease_weight[i] = max(faces[i].crease_weight-1.0f,0.0f); + + /* fast path for regular edge points */ + if (likely(faces[i].crease_weight <= 0.0f)) { + dest.ring[2*i] = (v+f) * 0.25f; + } + + /* slower path for hard edge rule */ + else { + C += ring[j]; crease_id[num_creases++] = i; + dest.ring[2*i] = v*0.5f; + + /* even slower path for blended edge rule */ + if (unlikely(faces[i].crease_weight < 1.0f)) { + dest.ring[2*i] = lerp((v+f)*0.25f,v*0.5f,faces[i].crease_weight); + } + } + j+=faces[i].size; + j%=edge_valence; + } + + /* compute new vertex using smooth rule */ + const float inv_face_valence = 1.0f / (float)face_valence; + const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence; + dest.vtx = v_smooth; + + /* compute new vertex using vertex_crease_weight rule */ + if (unlikely(vertex_crease_weight > 0.0f)) + { + if (vertex_crease_weight >= 1.0f) { + dest.vtx = vtx; + } else { + dest.vtx = lerp(vtx,v_smooth,vertex_crease_weight); + } + return; + } + + if (likely(num_creases <= 1)) + return; + + /* compute new vertex using crease rule */ + if (likely(num_creases == 2)) { + const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f); + const float crease_weight0 = faces[crease_id[0]].crease_weight; + const float crease_weight1 = faces[crease_id[1]].crease_weight; + dest.vtx = v_sharp; + dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); + dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); + const float v_blend = 0.5f*(crease_weight0+crease_weight1); + if (unlikely(v_blend < 1.0f)) { + dest.vtx = lerp(v_sharp,v_smooth,v_blend); + } + } + + /* compute new vertex using corner rule */ + else { + dest.vtx = vtx; + } + } + + void convert(CatmullClark1Ring& dst) const + { + dst.edge_level = edge_level; + dst.vertex_level = vertex_level; + dst.vtx = vtx; + dst.face_valence = face_valence; + dst.edge_valence = 2*face_valence; + dst.border_index = border_face == -1 ? -1 : 2*border_face; + for (size_t i=0; i<face_valence; i++) + dst.crease_weight[i] = faces[i].crease_weight; + dst.vertex_crease_weight = vertex_crease_weight; + for (size_t i=0; i<edge_valence; i++) dst.ring[i] = ring[i]; + + dst.eval_start_index = eval_start_face_index; + dst.eval_unique_identifier = eval_unique_identifier; + + assert( dst.hasValidPositions() ); + } + + + /* gets limit tangent in the direction of egde vtx -> ring[0] */ + __forceinline Vertex getLimitTangent() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + { + convert(cc_vtx); + return cc_vtx.getLimitTangent(); + } + + subdivide(cc_vtx); + return 2.0f * cc_vtx.getLimitTangent(); + } + + /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + __forceinline Vertex getSecondLimitTangent() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + { + convert(cc_vtx); + return cc_vtx.getSecondLimitTangent(); + } + + subdivide(cc_vtx); + return 2.0f * cc_vtx.getSecondLimitTangent(); + } + + + /* gets limit vertex */ + __forceinline Vertex getLimitVertex() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + convert(cc_vtx); + else + subdivide(cc_vtx); + return cc_vtx.getLimitVertex(); + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c) + { + o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence << + ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl; + for (size_t v=0, f=0; f<c.face_valence; v+=c.faces[f++].size) { + for (size_t i=v; i<v+c.faces[f].size; i++) { + o << i << " -> " << c.ring[i]; + if (i == v) o << " crease = " << c.faces[f].crease_weight; + o << embree_endl; + } + } + return o; + } + }; +} diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h new file mode 100644 index 0000000000..74fc4c1230 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h @@ -0,0 +1,297 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene_curves.h" + +/* + + Implements Catmul Rom curves with control points p0, p1, p2, p3. At + t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1 + the curve goes through p2 with tangent (p3-p2)/2. + + */ + +namespace embree +{ + class CatmullRomBasis + { + public: + + template<typename T> + static __forceinline Vec4<T> eval(const T& u) + { + const T t = u; + const T s = T(1.0f) - u; + const T n0 = - t * s * s; + const T n1 = 2.0f + t * t * (3.0f * t - 5.0f); + const T n2 = 2.0f + s * s * (3.0f * s - 5.0f); + const T n3 = - s * t * t; + return T(0.5f) * Vec4<T>(n0, n1, n2, n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = - s * s + 2.0f * s * t; + const T n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t; + const T n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s; + const T n3 = -2.0f * s * t + t * t; + return T(0.5f) * Vec4<T>(n0, n1, n2, n3); + } + + template<typename T> + static __forceinline Vec4<T> derivative2(const T& u) + { + const T t = u; + const T n0 = -3.0f * t + 2.0f; + const T n1 = 9.0f * t - 5.0f; + const T n2 = -9.0f * t + 4.0f; + const T n3 = 3.0f * t - 1.0f; + return Vec4<T>(n0, n1, n2, n3); + } + }; + + struct PrecomputedCatmullRomBasis + { + enum { N = 16 }; + public: + PrecomputedCatmullRomBasis() {} + PrecomputedCatmullRomBasis(int shift); + + /* basis for bspline evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bspline derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedCatmullRomBasis catmullrom_basis0; + extern PrecomputedCatmullRomBasis catmullrom_basis1; + + template<typename Vertex> + struct CatmullRomCurveT + { + Vertex v0,v1,v2,v3; + + __forceinline CatmullRomCurveT() {} + + __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + } + + __forceinline Vertex end() const { + return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline BBox<Vertex> bounds() const { + return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3)); + } + + __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) { + return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline CatmullRomCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,v0-p), v0.w); + const Vec3ff q1(xfmVector(space,v1-p), v1.w); + const Vec3ff q2(xfmVector(space,v2-p), v2.w); + const Vec3ff q3(xfmVector(space,v3-p), v3.w); + return CatmullRomCurveT<Vec3ff>(q0,q1,q2,q3); + } + + __forceinline Vertex eval(const float t) const + { + const Vec4<float> b = CatmullRomBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4<float> b = CatmullRomBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4<float> b = CatmullRomBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + p = eval(t); + dp = eval_du(t); + ddp = eval_dudu(t); + } + + template<int M> + __forceinline Vec4vf<M> veval(const vfloat<M>& t) const + { + const Vec4vf<M> b = CatmullRomBasis::eval(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const + { + const Vec4vf<M> b = CatmullRomBasis::derivative(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const + { + const Vec4vf<M> b = CatmullRomBasis::derivative2(t); + return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const + { + p = veval<M>(t); + dp = veval_du<M>(t); + } + + template<int M> + __forceinline Vec4vf<M> eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + template<int M> + __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat<M>::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf<M>(v0), + madd(vfloat<M>::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf<M>(v1), + madd(vfloat<M>::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf<M>(v2), + vfloat<M>::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf<M>(v3)))); + } + + /* calculates bounds of catmull-rom curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0<VSIZEX>(i,N); + const Vec4vfx dp = derivative0<VSIZEX>(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + const Vec3ff pe = end(); + return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<=N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) <= vintx(N); + const Vec4vfx pi = eval0<VSIZEX>(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(lower,upper),upper_r); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) { + return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + + template<typename CurveGeometry> + __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve) + { + return CatmullRomCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } + + typedef CatmullRomCurveT<Vec3fa> CatmullRomCurve3fa; +} + diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h new file mode 100644 index 0000000000..58c0b63e62 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval.h @@ -0,0 +1,226 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" + +namespace embree +{ + namespace isa + { + template<typename Vertex, typename Vertex_t = Vertex> + struct FeatureAdaptiveEval + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch; + typedef BezierPatchT<Vertex,Vertex_t> BezierPatch; + typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch; + typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch; + typedef BezierCurveT<Vertex> BezierCurve; + + public: + + FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; + case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + eval(patch,Vec2f(u,v),0); + break; + } + } + } + + FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + eval(patch,Vec2f(u,v),dscale,depth); + } + + void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE>& patches, const Vec2f& uv, size_t depth) + { + float u = uv.x, v = uv.y; + if (v < 0.5f) { + if (u < 0.5f) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,0); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = dpdx; *dPdv = dpdy; + } + } + else { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,1); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = -dpdy; *dPdv = dpdx; + } + } + } else { + if (u > 0.5f) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,2); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = -dpdx; *dPdv = -dpdy; + } + } + else { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,3); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = dpdy; *dPdv = -dpdx; + } + } + } + } + + __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const int max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=(size_t)max_eval_depth; +//#else + return depth>=(size_t)max_eval_depth; +//#endif + } + + void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, + BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) + { + while (true) + { + typename CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(234423,c,c,-1); + return; + } else { + IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(34534,c,-1,c); + return; + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); + RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(43524,c,c,-1); + return; + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); + GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(23498,c,-1,c); + return; + } +#endif + else + { + array_t<CatmullClarkPatch,4> patches; + patch.subdivide(patches); // FIXME: only have to generate one of the patches + + const float u = uv.x, v = uv.y; + if (v < 0.5f) { + if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; } + else { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; } + } else { + if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; } + else { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); dscale *= 2.0f; } + } + depth++; + } + } + } + + void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch qpatch; patch.init(qpatch); + return eval(qpatch,uv,1.0f,depth); + } + + /* subdivide patch */ + unsigned N; + array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; + patch.subdivide(patches,N); // FIXME: only have to generate one of the patches + + /* parametrization for quads */ + if (N == 4) + eval_general_quad(patch,patches,uv,depth); + + /* parametrization for arbitrary polygons */ + else + { + const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; + const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; + const unsigned i = 4*h+l; assert(i<N); + if (i >= N) return; + +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,i); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[i],Vec2f(u,v),1.0f,depth+1); +#endif + } + } + + private: + Vertex* const P; + Vertex* const dPdu; + Vertex* const dPdv; + Vertex* const ddPdudu; + Vertex* const ddPdvdv; + Vertex* const ddPdudv; + }; + } +} diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h new file mode 100644 index 0000000000..4755aba28d --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_grid.h @@ -0,0 +1,359 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "catmullclark_patch.h" +#include "bspline_patch.h" +#include "gregory_patch.h" +#include "tessellation.h" + +namespace embree +{ + namespace isa + { + struct FeatureAdaptiveEvalGrid + { + typedef CatmullClark1Ring3fa CatmullClarkRing; + typedef CatmullClarkPatch3fa CatmullClarkPatch; + typedef BilinearPatch3fa BilinearPatch; + typedef BSplinePatch3fa BSplinePatch; + typedef BezierPatch3fa BezierPatch; + typedef GregoryPatch3fa GregoryPatch; + + private: + const unsigned x0,x1; + const unsigned y0,y1; + const unsigned swidth,sheight; + const float rcp_swidth, rcp_sheight; + float* const Px; + float* const Py; + float* const Pz; + float* const U; + float* const V; + float* const Nx; + float* const Ny; + float* const Nz; + const unsigned dwidth; + //const unsigned dheight; + unsigned count; + + + public: + FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) + { + assert(swidth < (2<<20) && sheight < (2<<20)); + const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); + const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1)); + + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch3fa qpatch; patch.init(qpatch); + eval(qpatch, srange, erange, 0); + assert(count == (x1-x0+1)*(y1-y0+1)); + return; + } + + /* subdivide patch */ + unsigned N; + array_t<CatmullClarkPatch3fa,GeneralCatmullClarkPatch3fa::SIZE> patches; + patch.subdivide(patches,N); + + if (N == 4) + { + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + +#if PATCH_USE_GREGORY == 2 + BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders); + BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r); + GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); + eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r); + eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr); + eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr); + eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l); +#else + GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); + eval(patches[0],srange0,intersect(srange0,erange),1); + eval(patches[1],srange1,intersect(srange1,erange),1); + eval(patches[2],srange2,intersect(srange2,erange),1); + eval(patches[3],srange3,intersect(srange3,erange),1); +#endif + } + else + { + assert(subPatch < N); + +#if PATCH_USE_GREGORY == 2 + BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch); + BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[subPatch], srange, erange, 1); +#endif + + } + assert(count == (x1-x0+1)*(y1-y0+1)); + } + + FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch, + const BBox2f& srange, const BBox2f& erange, const unsigned depth, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) + { + eval(patch,srange,erange,depth); + } + + template<typename Patch> + void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) + { + const float scale_x = rcp(srange.upper.x-srange.lower.x); + const float scale_y = rcp(srange.upper.y-srange.lower.y); + count += (lx1-lx0)*(ly1-ly0); + +#if 0 + for (unsigned iy=ly0; iy<ly1; iy++) { + for (unsigned ix=lx0; ix<lx1; ix++) { + const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x); + const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y); + const Vec3fa p = patch.eval(lu,lv); + const float u = float(ix)*rcp_swidth; + const float v = float(iy)*rcp_sheight; + const int ofs = (iy-y0)*dwidth+(ix-x0); + Px[ofs] = p.x; + Py[ofs] = p.y; + Pz[ofs] = p.z; + U[ofs] = u; + V[ofs] = v; + } + } +#else + foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const Vec3vfx p = patch.eval(lu,lv); + Vec3vfx n = zero; + if (unlikely(Nx != nullptr)) n = normalize_safe(patch.normal(lu,lv)); + const vfloatx u = vfloatx(ix)*rcp_swidth; + const vfloatx v = vfloatx(iy)*rcp_sheight; + const vintx ofs = (iy-y0)*dwidth+(ix-x0); + if (likely(all(valid)) && all(iy==iy[0])) { + const unsigned ofs2 = ofs[0]; + vfloatx::storeu(Px+ofs2,p.x); + vfloatx::storeu(Py+ofs2,p.y); + vfloatx::storeu(Pz+ofs2,p.z); + vfloatx::storeu(U+ofs2,u); + vfloatx::storeu(V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(Nx+ofs2,n.x); + vfloatx::storeu(Ny+ofs2,n.y); + vfloatx::storeu(Nz+ofs2,n.z); + } + } else { + foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { + const unsigned ofs2 = ofs[j]-j; + vfloatx::storeu(valid,Px+ofs2,p.x); + vfloatx::storeu(valid,Py+ofs2,p.y); + vfloatx::storeu(valid,Pz+ofs2,p.z); + vfloatx::storeu(valid,U+ofs2,u); + vfloatx::storeu(valid,V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(valid,Nx+ofs2,n.x); + vfloatx::storeu(valid,Ny+ofs2,n.y); + vfloatx::storeu(valid,Nz+ofs2,n.z); + } + }); + } + }); +#endif + } + + __forceinline bool final(const CatmullClarkPatch3fa& patch, const CatmullClarkRing::Type type, unsigned depth) + { + const unsigned max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, + const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr) + { + if (erange.empty()) + return; + + int lx0 = (int) ceilf(erange.lower.x); + int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); + int ly0 = (int) ceilf(erange.lower.y); + int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); + if (lx0 >= lx1 || ly0 >= ly1) return; + + CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch rpatch(patch,border0,border1,border2,border3); + evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); + return; + } else { + IrregularFillPatch ipatch(patch,border0,border1,border2,border3); + evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1); + return; + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); + RegularPatch rpatch(patch,border0,border1,border2,border3); + evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); + return; + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); + GregoryPatch gpatch(patch,border0,border1,border2,border3); + evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1); + } +#endif + else + { + array_t<CatmullClarkPatch3fa,4> patches; + patch.subdivide(patches); + + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + + eval(patches[0],srange0,intersect(srange0,erange),depth+1); + eval(patches[1],srange1,intersect(srange1,erange),depth+1); + eval(patches[2],srange2,intersect(srange2,erange),depth+1); + eval(patches[3],srange3,intersect(srange3,erange),depth+1); + } + } + }; + + template<typename Eval, typename Patch> + bool stitch_col(const Patch& patch, int subPatch, + const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight) + { + assert(coarse_y <= fine_y); + if (likely(fine_y == coarse_y)) + return false; + + const unsigned y0s = stitch(y0,fine_y,coarse_y); + const unsigned y1s = stitch(y1,fine_y,coarse_y); + const unsigned M = y1s-y0s+1 + VSIZEX; + + dynamic_large_stack_array(float,px,M,64*sizeof(float)); + dynamic_large_stack_array(float,py,M,64*sizeof(float)); + dynamic_large_stack_array(float,pz,M,64*sizeof(float)); + dynamic_large_stack_array(float,u,M,64*sizeof(float)); + dynamic_large_stack_array(float,v,M,64*sizeof(float)); + dynamic_large_stack_array(float,nx,M,64*sizeof(float)); + dynamic_large_stack_array(float,ny,M,64*sizeof(float)); + dynamic_large_stack_array(float,nz,M,64*sizeof(float)); + const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); + Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, + has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097); + + for (unsigned y=y0; y<=y1; y++) + { + const unsigned ys = stitch(y,fine_y,coarse_y)-y0s; + Px[(y-y0)*dwidth+dx0] = px[ys]; + Py[(y-y0)*dwidth+dx0] = py[ys]; + Pz[(y-y0)*dwidth+dx0] = pz[ys]; + U [(y-y0)*dwidth+dx0] = u[ys]; + V [(y-y0)*dwidth+dx0] = v[ys]; + if (unlikely(has_Nxyz)) { + Nx[(y-y0)*dwidth+dx0] = nx[ys]; + Ny[(y-y0)*dwidth+dx0] = ny[ys]; + Nz[(y-y0)*dwidth+dx0] = nz[ys]; + } + } + return true; + } + + template<typename Eval, typename Patch> + bool stitch_row(const Patch& patch, int subPatch, + const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight) + { + assert(coarse_x <= fine_x); + if (likely(fine_x == coarse_x)) + return false; + + const unsigned x0s = stitch(x0,fine_x,coarse_x); + const unsigned x1s = stitch(x1,fine_x,coarse_x); + const unsigned M = x1s-x0s+1 + VSIZEX; + + dynamic_large_stack_array(float,px,M,32*sizeof(float)); + dynamic_large_stack_array(float,py,M,32*sizeof(float)); + dynamic_large_stack_array(float,pz,M,32*sizeof(float)); + dynamic_large_stack_array(float,u,M,32*sizeof(float)); + dynamic_large_stack_array(float,v,M,32*sizeof(float)); + dynamic_large_stack_array(float,nx,M,32*sizeof(float)); + dynamic_large_stack_array(float,ny,M,32*sizeof(float)); + dynamic_large_stack_array(float,nz,M,32*sizeof(float)); + const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); + Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, + has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? (float*)nz : nullptr, 4097,1); + + for (unsigned x=x0; x<=x1; x++) + { + const unsigned xs = stitch(x,fine_x,coarse_x)-x0s; + Px[dy0*dwidth+x-x0] = px[xs]; + Py[dy0*dwidth+x-x0] = py[xs]; + Pz[dy0*dwidth+x-x0] = pz[xs]; + U [dy0*dwidth+x-x0] = u[xs]; + V [dy0*dwidth+x-x0] = v[xs]; + if (unlikely(has_Nxyz)) { + Nx[dy0*dwidth+x-x0] = nx[xs]; + Ny[dy0*dwidth+x-x0] = ny[xs]; + Nz[dy0*dwidth+x-x0] = nz[xs]; + } + } + return true; + } + + template<typename Eval, typename Patch> + void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4], + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight) + { + bool sl = false, sr = false, st = false, sb = false; + if (levels) { + sl = x0 == 0 && stitch_col<Eval,Patch>(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); + sr = x1 == swidth-1 && stitch_col<Eval,Patch>(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight); + st = y0 == 0 && stitch_row<Eval,Patch>(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); + sb = y1 == sheight-1 && stitch_row<Eval,Patch>(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight); + } + const unsigned ofs = st*dwidth+sl; + Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight); + } + } +} + diff --git a/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h new file mode 100644 index 0000000000..edab0db12f --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/feature_adaptive_eval_simd.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" + +namespace embree +{ + namespace isa + { + template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex> + struct FeatureAdaptiveEvalSimd + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch; + typedef BezierPatchT<Vertex,Vertex_t> BezierPatch; + typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch; + typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch; + typedef BezierCurveT<Vertex> BezierCurve; + + FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; + case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT<Vertex,Vertex_t>(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + eval_direct(valid,patch,Vec2<vfloat>(u,v),0); + break; + } + } + } + + FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + eval_direct(valid,patch,Vec2<vfloat>(u,v),dscale,depth); + } + + template<size_t N> + __forceinline void eval_quad_direct(const vbool& valid, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) + { + const vfloat u = uv.x, v = uv.y; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); + } + + template<size_t N> + __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) + { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); +#endif + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + const vfloat u = uv.x, v = uv.y; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; +#if PATCH_USE_GREGORY == 2 + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l); +#else + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); +#endif + } + + __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2<vfloat>& uv, float dscale, size_t depth, + BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) + { + typename CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } else { + IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } +#endif + else + { + array_t<CatmullClarkPatch,4> patches; + patch.subdivide(patches); // FIXME: only have to generate one of the patches + eval_quad_direct(valid,patches,uv,dscale,depth); + } + } + + void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2<vfloat>& uv, const size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) { + CatmullClarkPatch qpatch; patch.init(qpatch); + return eval_direct(valid,qpatch,uv,1.0f,depth); + } + + /* subdivide patch */ + unsigned Nc; + array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; + patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches + + /* parametrization for quads */ + if (Nc == 4) + eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth); + + /* parametrization for arbitrary polygons */ + else + { + const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; + const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; + const vint i = (h<<2)+l; assert(all(valid,i<Nc)); + foreach_unique(valid,i,[&](const vbool& valid, const int i) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,i); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1); +#endif + }); + } + } + + private: + float* const P; + float* const dPdu; + float* const dPdv; + float* const ddPdudu; + float* const ddPdvdv; + float* const ddPdudv; + const size_t dstride; + const size_t N; + }; + } +} diff --git a/thirdparty/embree/kernels/subdiv/gregory_patch.h b/thirdparty/embree/kernels/subdiv/gregory_patch.h new file mode 100644 index 0000000000..9026d5c407 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/gregory_patch.h @@ -0,0 +1,893 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_patch.h" +#include "bezier_curve.h" +#include "catmullclark_coefficients.h" + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + class __aligned(64) GregoryPatchT + { + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring; + typedef BezierCurveT<Vertex> BezierCurve; + + public: + Vertex v[4][4]; + Vertex f[2][2]; + + __forceinline GregoryPatchT() {} + + __forceinline GregoryPatchT(const CatmullClarkPatch& patch) { + init(patch); + } + + __forceinline GregoryPatchT(const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + { + init_crackfix(patch,border0,border1,border2,border3); + } + + __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(CatmullClarkPatch(edge,vertices,stride)); + } + + __forceinline Vertex& p0() { return v[0][0]; } + __forceinline Vertex& p1() { return v[0][3]; } + __forceinline Vertex& p2() { return v[3][3]; } + __forceinline Vertex& p3() { return v[3][0]; } + + __forceinline Vertex& e0_p() { return v[0][1]; } + __forceinline Vertex& e0_m() { return v[1][0]; } + __forceinline Vertex& e1_p() { return v[1][3]; } + __forceinline Vertex& e1_m() { return v[0][2]; } + __forceinline Vertex& e2_p() { return v[3][2]; } + __forceinline Vertex& e2_m() { return v[2][3]; } + __forceinline Vertex& e3_p() { return v[2][0]; } + __forceinline Vertex& e3_m() { return v[3][1]; } + + __forceinline Vertex& f0_p() { return v[1][1]; } + __forceinline Vertex& f1_p() { return v[1][2]; } + __forceinline Vertex& f2_p() { return v[2][2]; } + __forceinline Vertex& f3_p() { return v[2][1]; } + __forceinline Vertex& f0_m() { return f[0][0]; } + __forceinline Vertex& f1_m() { return f[0][1]; } + __forceinline Vertex& f2_m() { return f[1][1]; } + __forceinline Vertex& f3_m() { return f[1][0]; } + + __forceinline const Vertex& p0() const { return v[0][0]; } + __forceinline const Vertex& p1() const { return v[0][3]; } + __forceinline const Vertex& p2() const { return v[3][3]; } + __forceinline const Vertex& p3() const { return v[3][0]; } + + __forceinline const Vertex& e0_p() const { return v[0][1]; } + __forceinline const Vertex& e0_m() const { return v[1][0]; } + __forceinline const Vertex& e1_p() const { return v[1][3]; } + __forceinline const Vertex& e1_m() const { return v[0][2]; } + __forceinline const Vertex& e2_p() const { return v[3][2]; } + __forceinline const Vertex& e2_m() const { return v[2][3]; } + __forceinline const Vertex& e3_p() const { return v[2][0]; } + __forceinline const Vertex& e3_m() const { return v[3][1]; } + + __forceinline const Vertex& f0_p() const { return v[1][1]; } + __forceinline const Vertex& f1_p() const { return v[1][2]; } + __forceinline const Vertex& f2_p() const { return v[2][2]; } + __forceinline const Vertex& f3_p() const { return v[2][1]; } + __forceinline const Vertex& f0_m() const { return f[0][0]; } + __forceinline const Vertex& f1_m() const { return f[0][1]; } + __forceinline const Vertex& f2_m() const { return f[1][1]; } + __forceinline const Vertex& f3_m() const { return f[1][0]; } + + __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) { + return irreg_patch.ring[index].getLimitVertex(); + } + + __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { + return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx); + } + + __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { + return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx); + } + + __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) + { + CatmullClark1Ring3fa r0,r1,r2; + irreg_patch.ring[index].subdivide(r0); + r0.subdivide(r1); + r1.subdivide(r2); + return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx); + } + + __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) + { + CatmullClark1Ring3fa r0,r1,r2; + irreg_patch.ring[index].subdivide(r0); + r0.subdivide(r1); + r1.subdivide(r2); + return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx); + } + + void initFaceVertex(const CatmullClarkPatch& irreg_patch, + const size_t index, + const Vertex& p_vtx, + const Vertex& e0_p_vtx, + const Vertex& e1_m_vtx, + const unsigned int face_valence_p1, + const Vertex& e0_m_vtx, + const Vertex& e3_p_vtx, + const unsigned int face_valence_p3, + Vertex& f_p_vtx, + Vertex& f_m_vtx) + { + const unsigned int face_valence = irreg_patch.ring[index].face_valence; + const unsigned int edge_valence = irreg_patch.ring[index].edge_valence; + const unsigned int border_index = irreg_patch.ring[index].border_index; + + const Vertex& vtx = irreg_patch.ring[index].vtx; + const Vertex e_i = irreg_patch.ring[index].getEdgeCenter(0); + const Vertex c_i_m_1 = irreg_patch.ring[index].getQuadCenter(0); + const Vertex e_i_m_1 = irreg_patch.ring[index].getEdgeCenter(1); + + Vertex c_i, e_i_p_1; + const bool hasHardEdge0 = + std::isinf(irreg_patch.ring[index].vertex_crease_weight) && + std::isinf(irreg_patch.ring[index].crease_weight[0]); + + if (unlikely((border_index == edge_valence-2) || hasHardEdge0)) + { + /* mirror quad center and edge mid-point */ + c_i = madd(2.0f, e_i - c_i_m_1, c_i_m_1); + e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1); + } + else + { + c_i = irreg_patch.ring[index].getQuadCenter( face_valence-1 ); + e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 ); + } + + Vertex c_i_m_2, e_i_m_2; + const bool hasHardEdge1 = + std::isinf(irreg_patch.ring[index].vertex_crease_weight) && + std::isinf(irreg_patch.ring[index].crease_weight[1]); + + if (unlikely(border_index == 2 || hasHardEdge1)) + { + /* mirror quad center and edge mid-point */ + c_i_m_2 = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1); + e_i_m_2 = madd(2.0f, vtx - e_i, + e_i); + } + else + { + c_i_m_2 = irreg_patch.ring[index].getQuadCenter( 1 ); + e_i_m_2 = irreg_patch.ring[index].getEdgeCenter( 2 ); + } + + const float d = 3.0f; + //const float c = cosf(2.0f*M_PI/(float)face_valence); + //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1); + //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3); + + const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); + const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); + const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); + + const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i); + const Vertex r_e_m = 1.0f/3.0f * (e_i - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2); + + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + } + + __noinline void init(const CatmullClarkPatch& patch) + { + assert( patch.ring[0].hasValidPositions() ); + assert( patch.ring[1].hasValidPositions() ); + assert( patch.ring[2].hasValidPositions() ); + assert( patch.ring[3].hasValidPositions() ); + + p0() = initCornerVertex(patch,0); + p1() = initCornerVertex(patch,1); + p2() = initCornerVertex(patch,2); + p3() = initCornerVertex(patch,3); + + e0_p() = initPositiveEdgeVertex(patch,0, p0()); + e1_p() = initPositiveEdgeVertex(patch,1, p1()); + e2_p() = initPositiveEdgeVertex(patch,2, p2()); + e3_p() = initPositiveEdgeVertex(patch,3, p3()); + + e0_m() = initNegativeEdgeVertex(patch,0, p0()); + e1_m() = initNegativeEdgeVertex(patch,1, p1()); + e2_m() = initNegativeEdgeVertex(patch,2, p2()); + e3_m() = initNegativeEdgeVertex(patch,3, p3()); + + const unsigned int face_valence_p0 = patch.ring[0].face_valence; + const unsigned int face_valence_p1 = patch.ring[1].face_valence; + const unsigned int face_valence_p2 = patch.ring[2].face_valence; + const unsigned int face_valence_p3 = patch.ring[3].face_valence; + + initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); + initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); + initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); + initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); + + } + + __noinline void init_crackfix(const CatmullClarkPatch& patch, + const BezierCurve* border0, + const BezierCurve* border1, + const BezierCurve* border2, + const BezierCurve* border3) + { + assert( patch.ring[0].hasValidPositions() ); + assert( patch.ring[1].hasValidPositions() ); + assert( patch.ring[2].hasValidPositions() ); + assert( patch.ring[3].hasValidPositions() ); + + p0() = initCornerVertex(patch,0); + p1() = initCornerVertex(patch,1); + p2() = initCornerVertex(patch,2); + p3() = initCornerVertex(patch,3); + + e0_p() = initPositiveEdgeVertex(patch,0, p0()); + e1_p() = initPositiveEdgeVertex(patch,1, p1()); + e2_p() = initPositiveEdgeVertex(patch,2, p2()); + e3_p() = initPositiveEdgeVertex(patch,3, p3()); + + e0_m() = initNegativeEdgeVertex(patch,0, p0()); + e1_m() = initNegativeEdgeVertex(patch,1, p1()); + e2_m() = initNegativeEdgeVertex(patch,2, p2()); + e3_m() = initNegativeEdgeVertex(patch,3, p3()); + + if (unlikely(border0 != nullptr)) + { + p0() = border0->v0; + e0_p() = border0->v1; + e1_m() = border0->v2; + p1() = border0->v3; + } + + if (unlikely(border1 != nullptr)) + { + p1() = border1->v0; + e1_p() = border1->v1; + e2_m() = border1->v2; + p2() = border1->v3; + } + + if (unlikely(border2 != nullptr)) + { + p2() = border2->v0; + e2_p() = border2->v1; + e3_m() = border2->v2; + p3() = border2->v3; + } + + if (unlikely(border3 != nullptr)) + { + p3() = border3->v0; + e3_p() = border3->v1; + e0_m() = border3->v2; + p0() = border3->v3; + } + + const unsigned int face_valence_p0 = patch.ring[0].face_valence; + const unsigned int face_valence_p1 = patch.ring[1].face_valence; + const unsigned int face_valence_p2 = patch.ring[2].face_valence; + const unsigned int face_valence_p3 = patch.ring[3].face_valence; + + initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); + initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); + initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); + initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); + } + + + void computeGregoryPatchFacePoints(const unsigned int face_valence, + const Vertex& r_e_p, + const Vertex& r_e_m, + const Vertex& p_vtx, + const Vertex& e0_p_vtx, + const Vertex& e1_m_vtx, + const unsigned int face_valence_p1, + const Vertex& e0_m_vtx, + const Vertex& e3_p_vtx, + const unsigned int face_valence_p3, + Vertex& f_p_vtx, + Vertex& f_m_vtx, + const float d = 3.0f) + { + //const float c = cosf(2.0*M_PI/(float)face_valence); + //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1); + //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3); + + const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); + const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); + const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); + + + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + } + + __noinline void init(const GeneralCatmullClarkPatch& patch) + { + assert(patch.size() == 4); +#if 0 + CatmullClarkPatch qpatch; patch.init(qpatch); + init(qpatch); +#else + const float face_valence_p0 = patch.ring[0].face_valence; + const float face_valence_p1 = patch.ring[1].face_valence; + const float face_valence_p2 = patch.ring[2].face_valence; + const float face_valence_p3 = patch.ring[3].face_valence; + + Vertex p0_r_p, p0_r_m; + patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m ); + + Vertex p1_r_p, p1_r_m; + patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m ); + + Vertex p2_r_p, p2_r_m; + patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), p2_r_p, p2_r_m ); + + Vertex p3_r_p, p3_r_m; + patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m ); + + computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() ); + computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() ); + computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() ); + computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p3, f3_p(), f3_m() ); + +#endif + } + + + __forceinline void convert_to_bezier() + { + f0_p() = (f0_p() + f0_m()) * 0.5f; + f1_p() = (f1_p() + f1_m()) * 0.5f; + f2_p() = (f2_p() + f2_m()) * 0.5f; + f3_p() = (f3_p() + f3_m()) * 0.5f; + f0_m() = Vertex( zero ); + f1_m() = Vertex( zero ); + f2_m() = Vertex( zero ); + f3_m() = Vertex( zero ); + } + + static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv, + Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21) + { + if (unlikely(uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f)) + { + matrix_11 = matrix[1][1]; + matrix_12 = matrix[1][2]; + matrix_22 = matrix[2][2]; + matrix_21 = matrix[2][1]; + } + else + { + const Vertex_t f0_p = matrix[1][1]; + const Vertex_t f1_p = matrix[1][2]; + const Vertex_t f2_p = matrix[2][2]; + const Vertex_t f3_p = matrix[2][1]; + + const Vertex_t f0_m = f_m[0][0]; + const Vertex_t f1_m = f_m[0][1]; + const Vertex_t f2_m = f_m[1][1]; + const Vertex_t f3_m = f_m[1][0]; + + matrix_11 = ( uu * f0_p + vv * f0_m)*rcp(uu+vv); + matrix_12 = ((1.0f-uu) * f1_m + vv * f1_p)*rcp(1.0f-uu+vv); + matrix_22 = ((1.0f-uu) * f2_p + (1.0f-vv) * f2_m)*rcp(2.0f-uu-vv); + matrix_21 = ( uu * f3_m + (1.0f-vv) * f3_p)*rcp(1.0f+uu-vv); + } + } + + template<typename vfloat> + static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], + size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) + { + const auto m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + + const vfloat f0_p = v[1][1][i]; + const vfloat f1_p = v[1][2][i]; + const vfloat f2_p = v[2][2][i]; + const vfloat f3_p = v[2][1][i]; + + const vfloat f0_m = f[0][0][i]; + const vfloat f1_m = f[0][1][i]; + const vfloat f2_m = f[1][1][i]; + const vfloat f3_m = f[1][0][i]; + + const vfloat one_minus_uu = vfloat(1.0f) - uu; + const vfloat one_minus_vv = vfloat(1.0f) - vv; + + const vfloat f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const vfloat f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const vfloat f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const vfloat f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + + matrix_11 = select(m_border,f0_p,f0_i); + matrix_12 = select(m_border,f1_p,f1_i); + matrix_22 = select(m_border,f2_p,f2_i); + matrix_21 = select(m_border,f3_p,f3_i); + } + + static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::eval(uu); + const Vec4<float> Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_du(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::derivative(uu); + const Vec4<float> Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::eval(uu); + const Vec4<float> Bv = BezierBasis::derivative(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::derivative2(uu); + const Vec4<float> Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::eval(uu); + const Vec4<float> Bv = BezierBasis::derivative2(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4<float> Bu = BezierBasis::derivative(uu); + const Vec4<float> Bv = BezierBasis::derivative(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + __forceinline Vertex eval(const float uu, const float vv) const { + return eval(v,f,uu,vv); + } + + __forceinline Vertex eval_du( const float uu, const float vv) const { + return eval_du(v,f,uu,vv); + } + + __forceinline Vertex eval_dv( const float uu, const float vv) const { + return eval_dv(v,f,uu,vv); + } + + __forceinline Vertex eval_dudu( const float uu, const float vv) const { + return eval_dudu(v,f,uu,vv); + } + + __forceinline Vertex eval_dvdv( const float uu, const float vv) const { + return eval_dvdv(v,f,uu,vv); + } + + __forceinline Vertex eval_dudv( const float uu, const float vv) const { + return eval_dudv(v,f,uu,vv); + } + + static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv) // FIXME: why not using basis functions + { + /* interpolate inner vertices */ + Vertex_t matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(matrix,f_m,uu,vv,matrix_11, matrix_12, matrix_22, matrix_21); + + /* tangentU */ + const Vertex_t col0 = deCasteljau(vv, (Vertex_t)matrix[0][0], (Vertex_t)matrix[1][0], (Vertex_t)matrix[2][0], (Vertex_t)matrix[3][0]); + const Vertex_t col1 = deCasteljau(vv, (Vertex_t)matrix[0][1], (Vertex_t)matrix_11 , (Vertex_t)matrix_21 , (Vertex_t)matrix[3][1]); + const Vertex_t col2 = deCasteljau(vv, (Vertex_t)matrix[0][2], (Vertex_t)matrix_12 , (Vertex_t)matrix_22 , (Vertex_t)matrix[3][2]); + const Vertex_t col3 = deCasteljau(vv, (Vertex_t)matrix[0][3], (Vertex_t)matrix[1][3], (Vertex_t)matrix[2][3], (Vertex_t)matrix[3][3]); + + const Vertex_t tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vertex_t row0 = deCasteljau(uu, (Vertex_t)matrix[0][0], (Vertex_t)matrix[0][1], (Vertex_t)matrix[0][2], (Vertex_t)matrix[0][3]); + const Vertex_t row1 = deCasteljau(uu, (Vertex_t)matrix[1][0], (Vertex_t)matrix_11 , (Vertex_t)matrix_12 , (Vertex_t)matrix[1][3]); + const Vertex_t row2 = deCasteljau(uu, (Vertex_t)matrix[2][0], (Vertex_t)matrix_21 , (Vertex_t)matrix_22 , (Vertex_t)matrix[2][3]); + const Vertex_t row3 = deCasteljau(uu, (Vertex_t)matrix[3][0], (Vertex_t)matrix[3][1], (Vertex_t)matrix[3][2], (Vertex_t)matrix[3][3]); + + const Vertex_t tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vertex_t n = cross(tangentU,tangentV); + + return n; + } + + __forceinline Vertex normal( const float uu, const float vv) const { + return normal(v,f,uu,vv); + } + + __forceinline void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, + Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template<class vfloat> + static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], + const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n, + vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) + { + const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); + const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i])))); + const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i])))); + const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); + return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + } + + template<typename vbool, typename vfloat> + static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], + const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) + { + if (P) { + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,P+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)); + } + } + if (dPdu) + { + { + assert(dPdu); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,dPdu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale); + } + } + { + assert(dPdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,dPdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale); + } + } + } + if (ddPdudu) + { + { + assert(ddPdudu); + const Vec4<vfloat> u_n = BezierBasis::derivative2(uu); + const Vec4<vfloat> v_n = BezierBasis::eval(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,ddPdudu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale)); + } + } + { + assert(ddPdvdv); + const Vec4<vfloat> u_n = BezierBasis::eval(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,ddPdvdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale)); + } + } + { + assert(ddPdudv); + const Vec4<vfloat> u_n = BezierBasis::derivative(uu); + const Vec4<vfloat> v_n = BezierBasis::derivative(vv); + for (size_t i=0; i<N; i++) { + vfloat matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times + vfloat::store(valid,ddPdudv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale)); + } + } + } + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const { + eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } + + template<class T> + static __forceinline Vec3<T> eval_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) + { + typedef typename T::Bool M; + const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + + const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + + const Vec3<T> f0_m = f[0][0]; + const Vec3<T> f1_m = f[0][1]; + const Vec3<T> f2_m = f[1][1]; + const Vec3<T> f3_m = f[1][0]; + + const T one_minus_uu = T(1.0f) - uu; + const T one_minus_vv = T(1.0f) - vv; + + const Vec3<T> f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const Vec3<T> f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const Vec3<T> f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + + const Vec3<T> F0( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); + const Vec3<T> F1( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); + const Vec3<T> F2( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); + const Vec3<T> F3( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); + + const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; + const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; + const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); + const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); + const T B2_u = 3.0f * (uu * one_minus_uu * uu); + const T B2_v = 3.0f * (vv * one_minus_vv * vv); + const T B3_u = uu * uu * uu; + const T B3_v = vv * vv * vv; + + const T x = madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u * matrix[0][3].x))), + madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,F0.x ,madd(B2_u,F1.x ,B3_u * matrix[1][3].x))), + madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,F3.x ,madd(B2_u,F2.x ,B3_u * matrix[2][3].x))), + B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u * matrix[3][3].x)))))); + + const T y = madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u * matrix[0][3].y))), + madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,F0.y ,madd(B2_u,F1.y ,B3_u * matrix[1][3].y))), + madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,F3.y ,madd(B2_u,F2.y ,B3_u * matrix[2][3].y))), + B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u * matrix[3][3].y)))))); + + const T z = madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u * matrix[0][3].z))), + madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,F0.z ,madd(B2_u,F1.z ,B3_u * matrix[1][3].z))), + madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,F3.z ,madd(B2_u,F2.z ,B3_u * matrix[2][3].z))), + B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u * matrix[3][3].z)))))); + + return Vec3<T>(x,y,z); + } + + template<class T> + __forceinline Vec3<T> eval(const T& uu, const T& vv) const + { + Vec3<T> ff[2][2]; + ff[0][0] = Vec3<T>(f[0][0]); + ff[0][1] = Vec3<T>(f[0][1]); + ff[1][1] = Vec3<T>(f[1][1]); + ff[1][0] = Vec3<T>(f[1][0]); + return eval_t(v,ff,uu,vv); + } + + template<class T> + static __forceinline Vec3<T> normal_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) + { + typedef typename T::Bool M; + + const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + + const Vec3<T> f0_m = f[0][0]; + const Vec3<T> f1_m = f[0][1]; + const Vec3<T> f2_m = f[1][1]; + const Vec3<T> f3_m = f[1][0]; + + const T one_minus_uu = T(1.0f) - uu; + const T one_minus_vv = T(1.0f) - vv; + + const Vec3<T> f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const Vec3<T> f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const Vec3<T> f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + +#if 1 + const M m_corner0 = (uu == 0.0f) & (vv == 0.0f); + const M m_corner1 = (uu == 1.0f) & (vv == 0.0f); + const M m_corner2 = (uu == 1.0f) & (vv == 1.0f); + const M m_corner3 = (uu == 0.0f) & (vv == 1.0f); + const Vec3<T> matrix_11( select(m_corner0,f0_p.x,f0_i.x), select(m_corner0,f0_p.y,f0_i.y), select(m_corner0,f0_p.z,f0_i.z) ); + const Vec3<T> matrix_12( select(m_corner1,f1_p.x,f1_i.x), select(m_corner1,f1_p.y,f1_i.y), select(m_corner1,f1_p.z,f1_i.z) ); + const Vec3<T> matrix_22( select(m_corner2,f2_p.x,f2_i.x), select(m_corner2,f2_p.y,f2_i.y), select(m_corner2,f2_p.z,f2_i.z) ); + const Vec3<T> matrix_21( select(m_corner3,f3_p.x,f3_i.x), select(m_corner3,f3_p.y,f3_i.y), select(m_corner3,f3_p.z,f3_i.z) ); +#else + const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + const Vec3<T> matrix_11( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); + const Vec3<T> matrix_12( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); + const Vec3<T> matrix_22( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); + const Vec3<T> matrix_21( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); +#endif + + const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); + const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); + const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); + const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); + + const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); + const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); + const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); + + const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); + const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); + const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); + + const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); + const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); + + /* tangentU */ + const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); + const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); + const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); + const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); + + const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); + const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); + const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); + const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); + + const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vec3<T> n = cross(tangentU,tangentV); + return n; + } + + template<class T> + __forceinline Vec3<T> normal(const T& uu, const T& vv) const + { + Vec3<T> ff[2][2]; + ff[0][0] = Vec3<T>(f[0][0]); + ff[0][1] = Vec3<T>(f[0][1]); + ff[1][1] = Vec3<T>(f[1][1]); + ff[1][0] = Vec3<T>(f[1][0]); + return normal_t(v,ff,uu,vv); + } + + __forceinline BBox<Vertex> bounds() const + { + const Vertex *const cv = &v[0][0]; + BBox<Vertex> bounds (cv[0]); + for (size_t i=1; i<16; i++) + bounds.extend( cv[i] ); + bounds.extend(f[0][0]); + bounds.extend(f[1][0]); + bounds.extend(f[1][1]); + bounds.extend(f[1][1]); + return bounds; + } + + friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + o << "v[" << y << "][" << x << "] " << p.v[y][x] << embree_endl; + + for (size_t y=0; y<2; y++) + for (size_t x=0; x<2; x++) + o << "f[" << y << "][" << x << "] " << p.f[y][x] << embree_endl; + return o; + } + }; + + typedef GregoryPatchT<Vec3fa,Vec3fa_t> GregoryPatch3fa; + + template<typename Vertex, typename Vertex_t> + __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) + { + CatmullClarkPatchT<Vertex,Vertex_t> patch(edge,vertices,stride); + GregoryPatchT<Vertex,Vertex_t> gpatch(patch); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } + + template<typename Vertex, typename Vertex_t> + __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch) + { + GregoryPatchT<Vertex,Vertex_t> gpatch(patch); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } + + template<typename Vertex, typename Vertex_t> + __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, + const BezierCurveT<Vertex>* border0, + const BezierCurveT<Vertex>* border1, + const BezierCurveT<Vertex>* border2, + const BezierCurveT<Vertex>* border3) + { + GregoryPatchT<Vertex,Vertex_t> gpatch(patch,border0,border1,border2,border3); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } +} diff --git a/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h new file mode 100644 index 0000000000..4cf9a7e98f --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/gregory_patch_dense.h @@ -0,0 +1,113 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gregory_patch.h" + +namespace embree +{ + class __aligned(64) DenseGregoryPatch3fa + { + typedef Vec3fa Vec3fa_4x4[4][4]; + public: + + __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f); + + matrix[0][0].w = patch.f[0][0].x; + matrix[0][1].w = patch.f[0][0].y; + matrix[0][2].w = patch.f[0][0].z; + matrix[0][3].w = 0.0f; + + matrix[1][0].w = patch.f[0][1].x; + matrix[1][1].w = patch.f[0][1].y; + matrix[1][2].w = patch.f[0][1].z; + matrix[1][3].w = 0.0f; + + matrix[2][0].w = patch.f[1][1].x; + matrix[2][1].w = patch.f[1][1].y; + matrix[2][2].w = patch.f[1][1].z; + matrix[2][3].w = 0.0f; + + matrix[3][0].w = patch.f[1][0].x; + matrix[3][1].w = patch.f[1][0].y; + matrix[3][2].w = patch.f[1][0].z; + matrix[3][3].w = 0.0f; + } + + __forceinline void extract_f_m(Vec3fa f_m[2][2]) const + { + f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3fa( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + } + + __forceinline Vec3fa eval(const float uu, const float vv) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + __forceinline Vec3fa normal(const float uu, const float vv) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + template<class T> + __forceinline Vec3<T> eval(const T &uu, const T &vv) const + { + Vec3<T> f_m[2][2]; + f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + template<class T> + __forceinline Vec3<T> normal(const T &uu, const T &vv) const + { + Vec3<T> f_m[2][2]; + f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + __forceinline void eval(const float u, const float v, + Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv, + const float dscale = 1.0f) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + if (P) { + *P = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; + assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + } + } + + template<typename vbool, typename vfloat> + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N); + } + + private: + Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component + }; +} diff --git a/thirdparty/embree/kernels/subdiv/gridrange.h b/thirdparty/embree/kernels/subdiv/gridrange.h new file mode 100644 index 0000000000..4f2b90d7bd --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/gridrange.h @@ -0,0 +1,96 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" + +namespace embree +{ + struct __aligned(16) GridRange + { + unsigned int u_start; + unsigned int u_end; + unsigned int v_start; + unsigned int v_end; + + __forceinline GridRange() {} + + __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) + : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {} + + __forceinline unsigned int width() const { + return u_end-u_start+1; + } + + __forceinline unsigned int height() const { + return v_end-v_start+1; + } + + __forceinline bool hasLeafSize() const + { + const unsigned int u_size = u_end-u_start+1; + const unsigned int v_size = v_end-v_start+1; + assert(u_size >= 1); + assert(v_size >= 1); + return u_size <= 3 && v_size <= 3; + } + + static __forceinline unsigned int split(unsigned int start,unsigned int end) + { + const unsigned int center = (start+end)/2; + assert (center > start); + assert (center < end); + return center; + } + + __forceinline void split(GridRange& r0, GridRange& r1) const + { + assert( hasLeafSize() == false ); + const unsigned int u_size = u_end-u_start+1; + const unsigned int v_size = v_end-v_start+1; + r0 = *this; + r1 = *this; + + if (u_size >= v_size) + { + const unsigned int u_mid = split(u_start,u_end); + r0.u_end = u_mid; + r1.u_start = u_mid; + } + else + { + const unsigned int v_mid = split(v_start,v_end); + r0.v_end = v_mid; + r1.v_start = v_mid; + } + } + + __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const + { + assert( !hasLeafSize() ); + unsigned int children = 0; + GridRange first,second; + split(first,second); + + if (first.hasLeafSize()) { + r[0] = first; + children++; + } + else { + first.split(r[0],r[1]); + children += 2; + } + + if (second.hasLeafSize()) { + r[children] = second; + children++; + } + else { + second.split(r[children+0],r[children+1]); + children += 2; + } + return children; + } + }; +} diff --git a/thirdparty/embree/kernels/subdiv/half_edge.h b/thirdparty/embree/kernels/subdiv/half_edge.h new file mode 100644 index 0000000000..baf019cd79 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/half_edge.h @@ -0,0 +1,371 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_coefficients.h" + +namespace embree +{ + class __aligned(32) HalfEdge + { + friend class SubdivMesh; + public: + + enum PatchType : char { + BILINEAR_PATCH = 0, //!< a bilinear patch + REGULAR_QUAD_PATCH = 1, //!< a regular quad patch can be represented as a B-Spline + IRREGULAR_QUAD_PATCH = 2, //!< an irregular quad patch can be represented as a Gregory patch + COMPLEX_PATCH = 3 //!< these patches need subdivision and cannot be processed by the above fast code paths + }; + + enum VertexType : char { + REGULAR_VERTEX = 0, //!< regular vertex + NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a non-manifold edge + }; + + __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) { + return (PatchType) max((int)ty0,(int)ty1); + } + + struct Edge + { + /*! edge constructor */ + __forceinline Edge(const uint32_t v0, const uint32_t v1) + : v0(v0), v1(v1) {} + + /*! create an 64 bit identifier that is unique for the not oriented edge */ + __forceinline operator uint64_t() const + { + uint32_t p0 = v0, p1 = v1; + if (p0<p1) std::swap(p0,p1); + return (((uint64_t)p0) << 32) | (uint64_t)p1; + } + + public: + uint32_t v0,v1; //!< start and end vertex of the edge + }; + + HalfEdge () + : vtx_index(-1), next_half_edge_ofs(0), prev_half_edge_ofs(0), opposite_half_edge_ofs(0), edge_crease_weight(0), + vertex_crease_weight(0), edge_level(0), patch_type(COMPLEX_PATCH), vertex_type(REGULAR_VERTEX) + { + static_assert(sizeof(HalfEdge) == 32, "invalid half edge size"); + } + + __forceinline bool hasOpposite() const { return opposite_half_edge_ofs != 0; } + __forceinline void setOpposite(HalfEdge* opposite) { opposite_half_edge_ofs = int(opposite-this); } + + __forceinline HalfEdge* next() { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; } + __forceinline const HalfEdge* next() const { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; } + + __forceinline HalfEdge* prev() { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; } + __forceinline const HalfEdge* prev() const { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; } + + __forceinline HalfEdge* opposite() { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; } + __forceinline const HalfEdge* opposite() const { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; } + + __forceinline HalfEdge* rotate() { return opposite()->next(); } + __forceinline const HalfEdge* rotate() const { return opposite()->next(); } + + __forceinline unsigned int getStartVertexIndex() const { return vtx_index; } + __forceinline unsigned int getEndVertexIndex () const { return next()->vtx_index; } + __forceinline Edge getEdge () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); } + + + /*! tests if the start vertex of the edge is regular */ + __forceinline PatchType vertexType() const + { + const HalfEdge* p = this; + size_t face_valence = 0; + bool hasBorder = false; + + do + { + /* we need subdivision to handle edge creases */ + if (p->hasOpposite() && p->edge_crease_weight > 0.0f) + return COMPLEX_PATCH; + + face_valence++; + + /* test for quad */ + const HalfEdge* pp = p; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp != p) return COMPLEX_PATCH; + + /* continue with next face */ + p = p->prev(); + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + face_valence++; + hasBorder = true; + p = this; + while (p->hasOpposite()) + p = p->rotate(); + } + } while (p != this); + + /* calculate vertex type */ + if (face_valence == 2 && hasBorder) { + if (vertex_crease_weight == 0.0f ) return REGULAR_QUAD_PATCH; + else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH; + else return COMPLEX_PATCH; + } + else if (vertex_crease_weight != 0.0f) return COMPLEX_PATCH; + else if (face_valence == 3 && hasBorder) return REGULAR_QUAD_PATCH; + else if (face_valence == 4 && !hasBorder) return REGULAR_QUAD_PATCH; + else return IRREGULAR_QUAD_PATCH; + } + + /*! tests if this edge is part of a bilinear patch */ + __forceinline bool bilinearVertex() const { + return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf); + } + + /*! calculates the type of the patch */ + __forceinline PatchType patchType() const + { + const HalfEdge* p = this; + PatchType ret = REGULAR_QUAD_PATCH; + bool bilinear = true; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) != this) return COMPLEX_PATCH; + + if (bilinear) return BILINEAR_PATCH; + return ret; + } + + /*! tests if the face is a regular b-spline face */ + __forceinline bool isRegularFace() const { + return patch_type == REGULAR_QUAD_PATCH; + } + + /*! tests if the face can be diced (using bspline or gregory patch) */ + __forceinline bool isGregoryFace() const { + return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH; + } + + /*! tests if the base vertex of this half edge is a corner vertex */ + __forceinline bool isCorner() const { + return !hasOpposite() && !prev()->hasOpposite(); + } + + /*! tests if the vertex is attached to any border */ + __forceinline bool vertexHasBorder() const + { + const HalfEdge* p = this; + do { + if (!p->hasOpposite()) return true; + p = p->rotate(); + } while (p != this); + return false; + } + + /*! tests if the face this half edge belongs to has some border */ + __forceinline bool faceHasBorder() const + { + const HalfEdge* p = this; + do { + if (p->vertexHasBorder() && (p->vertex_type != HalfEdge::NON_MANIFOLD_EDGE_VERTEX)) return true; + p = p->next(); + } while (p != this); + return false; + } + + /*! calculates conservative bounds of a catmull clark subdivision face */ + __forceinline BBox3fa bounds(const BufferView<Vec3fa>& vertices) const + { + BBox3fa bounds = this->get1RingBounds(vertices); + for (const HalfEdge* p=this->next(); p!=this; p=p->next()) + bounds.extend(p->get1RingBounds(vertices)); + return bounds; + } + + /*! tests if this is a valid patch */ + __forceinline bool valid(const BufferView<Vec3fa>& vertices) const + { + size_t N = 1; + if (!this->validRing(vertices)) return false; + for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) { + if (!p->validRing(vertices)) return false; + } + return N >= 3 && N <= MAX_PATCH_VALENCE; + } + + /*! counts number of polygon edges */ + __forceinline unsigned int numEdges() const + { + unsigned int N = 1; + for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++); + return N; + } + + /*! calculates face and edge valence */ + __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const + { + faceValence = 0; + edgeValence = 0; + + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + unsigned int numEdges = p->numEdges(); + assert(numEdges >= 3); + edgeValence += numEdges-2; + + faceValence++; + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + faceValence++; + edgeValence++; + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + } + + /*! stream output */ + friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h) + { + return o << "{ " << + "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << + "prev = " << h.prev_half_edge_ofs << ", " << + "next = " << h.next_half_edge_ofs << ", " << + "opposite = " << h.opposite_half_edge_ofs << ", " << + "edge_crease = " << h.edge_crease_weight << ", " << + "vertex_crease = " << h.vertex_crease_weight << ", " << + //"edge_level = " << h.edge_level << + " }"; + } + + private: + + /*! calculates the bounds of the face associated with the half-edge */ + __forceinline BBox3fa getFaceBounds(const BufferView<Vec3fa>& vertices) const + { + BBox3fa b = vertices[getStartVertexIndex()]; + for (const HalfEdge* p = next(); p!=this; p=p->next()) { + b.extend(vertices[p->getStartVertexIndex()]); + } + return b; + } + + /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */ + __forceinline BBox3fa get1RingBounds(const BufferView<Vec3fa>& vertices) const + { + BBox3fa bounds = empty; + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + bounds.extend(p->getFaceBounds(vertices)); + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + + return bounds; + } + + /*! tests if this is a valid face */ + __forceinline bool validFace(const BufferView<Vec3fa>& vertices, size_t& N) const + { + const Vec3fa v = vertices[getStartVertexIndex()]; + if (!isvalid(v)) return false; + size_t n = 1; + for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) { + const Vec3fa v = vertices[p->getStartVertexIndex()]; + if (!isvalid(v)) return false; + } + N += n-2; + return n >= 3 && n <= MAX_PATCH_VALENCE; + } + + /*! tests if this is a valid ring */ + __forceinline bool validRing(const BufferView<Vec3fa>& vertices) const + { + size_t faceValence = 0; + size_t edgeValence = 0; + + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + if (!p->validFace(vertices,edgeValence)) + return false; + + faceValence++; + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + faceValence++; + edgeValence++; + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + + return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE; + } + + private: + unsigned int vtx_index; //!< index of edge start vertex + int next_half_edge_ofs; //!< relative offset to next half edge of face + int prev_half_edge_ofs; //!< relative offset to previous half edge of face + int opposite_half_edge_ofs; //!< relative offset to opposite half edge + + public: + float edge_crease_weight; //!< crease weight attached to edge + float vertex_crease_weight; //!< crease weight attached to start vertex + float edge_level; //!< subdivision factor for edge + PatchType patch_type; //!< stores type of subdiv patch + VertexType vertex_type; //!< stores type of the start vertex + char align[2]; + }; +} diff --git a/thirdparty/embree/kernels/subdiv/hermite_curve.h b/thirdparty/embree/kernels/subdiv/hermite_curve.h new file mode 100644 index 0000000000..ffef5a4315 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/hermite_curve.h @@ -0,0 +1,39 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "bezier_curve.h" + +namespace embree +{ + template<typename Vertex> + struct HermiteCurveT : BezierCurveT<Vertex> + { + __forceinline HermiteCurveT() {} + + __forceinline HermiteCurveT(const BezierCurveT<Vertex>& curve) + : BezierCurveT<Vertex>(curve) {} + + __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1) + : BezierCurveT<Vertex>(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {} + + __forceinline HermiteCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w); + const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w); + const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w); + const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w); + return BezierCurveT<Vec3ff>(q0,q1,q2,q3); + } + }; + + template<typename CurveGeometry> + __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) { + return HermiteCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT<Vec3ff>(curve))); + } + + typedef HermiteCurveT<Vec3fa> HermiteCurve3fa; +} + diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h new file mode 100644 index 0000000000..f8e8a25f35 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h @@ -0,0 +1,403 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bezier_curve.h" + +namespace embree +{ + namespace isa + { + template<typename V> + struct TensorLinearQuadraticBezierSurface + { + QuadraticBezierCurve<V> L; + QuadraticBezierCurve<V> R; + + __forceinline TensorLinearQuadraticBezierSurface() {} + + __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<V>& curve) + : L(curve.L), R(curve.R) {} + + __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { + L = other.L; R = other.R; return *this; + } + + __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<V>& L, const QuadraticBezierCurve<V>& R) + : L(L), R(R) {} + + __forceinline BBox<V> bounds() const { + return merge(L.bounds(),R.bounds()); + } + }; + + template<> + struct TensorLinearQuadraticBezierSurface<Vec2fa> + { + QuadraticBezierCurve<vfloat4> LR; + + __forceinline TensorLinearQuadraticBezierSurface() {} + + __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<Vec2fa>& curve) + : LR(curve.LR) {} + + __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { + LR = other.LR; return *this; + } + + __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<vfloat4>& LR) + : LR(LR) {} + + __forceinline BBox<Vec2fa> bounds() const + { + const BBox<vfloat4> b = LR.bounds(); + const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper)); + const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + return merge(bl,br); + } + }; + + template<typename V> + struct TensorLinearCubicBezierSurface + { + CubicBezierCurve<V> L; + CubicBezierCurve<V> R; + + __forceinline TensorLinearCubicBezierSurface() {} + + __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) + : L(curve.L), R(curve.R) {} + + __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { + L = other.L; R = other.R; return *this; + } + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<V>& L, const CubicBezierCurve<V>& R) + : L(L), R(R) {} + + template<template<typename T> class SourceCurve> + __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve<Vec3ff>& center, const SourceCurve<Vec3fa>& normal) + { + SourceCurve<Vec3ff> vcurve = center; + SourceCurve<Vec3fa> ncurve = normal; + + /* here we construct a patch which follows the curve l(t) = + * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */ + + const Vec3ff p0 = vcurve.eval(0.0f); + const Vec3ff dp0 = vcurve.eval_du(0.0f); + const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); + + const Vec3fa n0 = ncurve.eval(0.0f); + const Vec3fa dn0 = ncurve.eval_du(0.0f); + + const Vec3ff p1 = vcurve.eval(1.0f); + const Vec3ff dp1 = vcurve.eval_du(1.0f); + const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); + + const Vec3fa n1 = ncurve.eval(1.0f); + const Vec3fa dn1 = ncurve.eval_du(1.0f); + + const Vec3fa bt0 = cross(n0,dp0); + const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0); + + const Vec3fa bt1 = cross(n1,dp1); + const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1); + + const Vec3fa k0 = normalize(bt0); + const Vec3fa dk0 = dnormalize(bt0,dbt0); + + const Vec3fa k1 = normalize(bt1); + const Vec3fa dk1 = dnormalize(bt1,dbt1); + + const Vec3fa l0 = p0 - p0.w*k0; + const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0); + + const Vec3fa r0 = p0 + p0.w*k0; + const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0); + + const Vec3fa l1 = p1 - p1.w*k1; + const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1); + + const Vec3fa r1 = p1 + p1.w*k1; + const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1); + + const float scale = 1.0f/3.0f; + CubicBezierCurve<V> L(l0,l0+scale*dl0,l1-scale*dl1,l1); + CubicBezierCurve<V> R(r0,r0+scale*dr0,r1-scale*dr1,r1); + return TensorLinearCubicBezierSurface(L,R); + } + + __forceinline BBox<V> bounds() const { + return merge(L.bounds(),R.bounds()); + } + + __forceinline BBox3fa accurateBounds() const { + return merge(L.accurateBounds(),R.accurateBounds()); + } + + __forceinline CubicBezierCurve<Interval1f> reduce_v() const { + return merge(CubicBezierCurve<Interval<V>>(L),CubicBezierCurve<Interval<V>>(R)); + } + + __forceinline LinearBezierCurve<Interval1f> reduce_u() const { + return LinearBezierCurve<Interval1f>(L.bounds(),R.bounds()); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx) const { + return TensorLinearCubicBezierSurface<float>(L.xfm(dx),R.xfm(dx)); + } + + __forceinline TensorLinearCubicBezierSurface<vfloatx> vxfm(const V& dx) const { + return TensorLinearCubicBezierSurface<vfloatx>(L.vxfm(dx),R.vxfm(dx)); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx, const V& p) const { + return TensorLinearCubicBezierSurface<float>(L.xfm(dx,p),R.xfm(dx,p)); + } + + __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space) const { + return TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space)); + } + + __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const { + return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p)); + } + + __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const { + return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s)); + } + + __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { + return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u)); + } + + __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const { + return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper))); + } + + __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { + return clip_v(v).clip_u(u); + } + + __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const + { + CubicBezierCurve<V> L0,L1; L.split(L0,L1,u); + CubicBezierCurve<V> R0,R1; R.split(R0,R1,u); + new (&left ) TensorLinearCubicBezierSurface(L0,R0); + new (&right) TensorLinearCubicBezierSurface(L1,R1); + } + + __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const { + valid = true; clear(valid,VSIZEX-1); + return TensorLinearCubicBezierSurface<Vec2vfx>(L.split(u),R.split(u)); + } + + __forceinline V eval(const float u, const float v) const { + return clerp(L,R,V(v)).eval(u); + } + + __forceinline V eval_du(const float u, const float v) const { + return clerp(L,R,V(v)).eval_dt(u); + } + + __forceinline V eval_dv(const float u, const float v) const { + return (R-L).eval(u); + } + + __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const + { + V p0, dp0du; L.eval(u,p0,dp0du); + V p1, dp1du; R.eval(u,p1,dp1du); + p = lerp(p0,p1,v); + dpdu = lerp(dp0du,dp1du,v); + dpdv = p1-p0; + } + + __forceinline TensorLinearQuadraticBezierSurface<V> derivative_u() const { + return TensorLinearQuadraticBezierSurface<V>(L.derivative(),R.derivative()); + } + + __forceinline CubicBezierCurve<V> derivative_v() const { + return R-L; + } + + __forceinline V axis_u() const { + return (L.end()-L.begin())+(R.end()-R.begin()); + } + + __forceinline V axis_v() const { + return (R.begin()-L.begin())+(R.end()-L.end()); + } + + friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) + { + return cout << "TensorLinearCubicBezierSurface" << embree_endl + << "{" << embree_endl + << " L = " << a.L << ", " << embree_endl + << " R = " << a.R << embree_endl + << "}"; + } + + friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) { + return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t))); + } + }; + + template<> + struct TensorLinearCubicBezierSurface<Vec2fa> + { + CubicBezierCurve<vfloat4> LR; + + __forceinline TensorLinearCubicBezierSurface() {} + + __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) + : LR(curve.LR) {} + + __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { + LR = other.LR; return *this; + } + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<vfloat4>& LR) + : LR(LR) {} + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<Vec2fa>& L, const CubicBezierCurve<Vec2fa>& R) + : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {} + + __forceinline CubicBezierCurve<Vec2fa> getL() const { + return CubicBezierCurve<Vec2fa>(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3)); + } + + __forceinline CubicBezierCurve<Vec2fa> getR() const { + return CubicBezierCurve<Vec2fa>(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3))); + } + + __forceinline BBox<Vec2fa> bounds() const + { + const BBox<vfloat4> b = LR.bounds(); + const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper)); + const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + return merge(bl,br); + } + + __forceinline BBox1f bounds(const Vec2fa& axis) const + { + const CubicBezierCurve<vfloat4> LRx = LR; + const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy); + const BBox<vfloat4> Lb = LRa.bounds(); + const BBox<vfloat4> Rb(shuffle<3>(Lb.lower),shuffle<3>(Lb.upper)); + const BBox<vfloat4> b = merge(Lb,Rb); + return BBox1f(b.lower[0],b.upper[0]); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx) const + { + const CubicBezierCurve<vfloat4> LRx = LR; + const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); + return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), + CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); + } + + __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx, const Vec2fa& p) const + { + const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p)); + const CubicBezierCurve<vfloat4> LRx = LR-pxyxy; + const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); + return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), + CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); + } + + __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { + return TensorLinearCubicBezierSurface(LR.clip(u)); + } + + __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const + { + const CubicBezierCurve<vfloat4> LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3)); + const CubicBezierCurve<vfloat4> RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3)); + return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper))); + } + + __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { + return clip_v(v).clip_u(u); + } + + __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const + { + CubicBezierCurve<vfloat4> LR0,LR1; LR.split(LR0,LR1,u); + new (&left ) TensorLinearCubicBezierSurface(LR0); + new (&right) TensorLinearCubicBezierSurface(LR1); + } + + __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const { + valid = true; clear(valid,VSIZEX-1); + return TensorLinearCubicBezierSurface<Vec2vfx>(getL().split(u),getR().split(u)); + } + + __forceinline Vec2fa eval(const float u, const float v) const + { + const vfloat4 p = LR.eval(u); + return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v)); + } + + __forceinline Vec2fa eval_du(const float u, const float v) const + { + const vfloat4 dpdu = LR.eval_dt(u); + return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v)); + } + + __forceinline Vec2fa eval_dv(const float u, const float v) const + { + const vfloat4 p = LR.eval(u); + return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)); + } + + __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const + { + vfloat4 p0, dp0du; LR.eval(u,p0,dp0du); + p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v)); + dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v)); + dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)); + } + + __forceinline TensorLinearQuadraticBezierSurface<Vec2fa> derivative_u() const { + return TensorLinearQuadraticBezierSurface<Vec2fa>(LR.derivative()); + } + + __forceinline CubicBezierCurve<Vec2fa> derivative_v() const { + return getR()-getL(); + } + + __forceinline Vec2fa axis_u() const + { + const CubicBezierCurve<Vec2fa> L = getL(); + const CubicBezierCurve<Vec2fa> R = getR(); + return (L.end()-L.begin())+(R.end()-R.begin()); + } + + __forceinline Vec2fa axis_v() const + { + const CubicBezierCurve<Vec2fa> L = getL(); + const CubicBezierCurve<Vec2fa> R = getR(); + return (R.begin()-L.begin())+(R.end()-L.end()); + } + + friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) + { + return cout << "TensorLinearCubicBezierSurface" << embree_endl + << "{" << embree_endl + << " L = " << a.getL() << ", " << embree_endl + << " R = " << a.getR() << embree_endl + << "}"; + } + }; + + typedef TensorLinearCubicBezierSurface<float> TensorLinearCubicBezierSurface1f; + typedef TensorLinearCubicBezierSurface<Vec2fa> TensorLinearCubicBezierSurface2fa; + typedef TensorLinearCubicBezierSurface<Vec3fa> TensorLinearCubicBezierSurface3fa; + } +} diff --git a/thirdparty/embree/kernels/subdiv/patch.h b/thirdparty/embree/kernels/subdiv/patch.h new file mode 100644 index 0000000000..c4340ea9b6 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch.h @@ -0,0 +1,371 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bilinear_patch.h" +#include "bspline_patch.h" +#include "bezier_patch.h" +#include "gregory_patch.h" +#include "tessellation_cache.h" + +#if 1 +#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) +#else +#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) \ + { \ + size_t hex = (size_t)ptr; \ + for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8); \ + const float c = (float)(((hex >> 0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \ + if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f); \ + } +#endif + +#define PATCH_MAX_CACHE_DEPTH 2 +//#define PATCH_MIN_RESOLUTION 1 // FIXME: not yet completely implemented +#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10 // maximum evaluation depth at irregular vertices (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) +#define PATCH_MAX_EVAL_DEPTH_CREASE 10 // maximum evaluation depth at crease features (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) +#define PATCH_USE_GREGORY 1 // 0 = no gregory, 1 = fill, 2 = as early as possible + +#if PATCH_USE_GREGORY==2 +#define PATCH_USE_BEZIER_PATCH 1 // enable use of bezier instead of b-spline patches +#else +#define PATCH_USE_BEZIER_PATCH 0 // enable use of bezier instead of b-spline patches +#endif + +#if PATCH_USE_BEZIER_PATCH +# define RegularPatch BezierPatch +# define RegularPatchT BezierPatchT<Vertex,Vertex_t> +#else +# define RegularPatch BSplinePatch +# define RegularPatchT BSplinePatchT<Vertex,Vertex_t> +#endif + +#if PATCH_USE_GREGORY +#define IrregularFillPatch GregoryPatch +#define IrregularFillPatchT GregoryPatchT<Vertex,Vertex_t> +#else +#define IrregularFillPatch BilinearPatch +#define IrregularFillPatchT BilinearPatchT<Vertex,Vertex_t> +#endif + +namespace embree +{ + template<typename Vertex, typename Vertex_t = Vertex> + struct __aligned(64) PatchT + { + public: + + typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing; + typedef BezierCurveT<Vertex> BezierCurve; + + enum Type { + INVALID_PATCH = 0, + BILINEAR_PATCH = 1, + BSPLINE_PATCH = 2, + BEZIER_PATCH = 3, + GREGORY_PATCH = 4, + SUBDIVIDED_GENERAL_PATCH = 7, + SUBDIVIDED_QUAD_PATCH = 8, + EVAL_PATCH = 9, + }; + + struct Ref + { + __forceinline Ref(void* p = nullptr) + : ptr((size_t)p) {} + + __forceinline operator bool() const { return ptr != 0; } + __forceinline operator size_t() const { return ptr; } + + __forceinline Ref (Type ty, void* in) + : ptr(((size_t)in)+ty) { assert((((size_t)in) & 0xF) == 0); } + + __forceinline Type type () const { return (Type)(ptr & 0xF); } + __forceinline void* object() const { return (void*) (ptr & ~0xF); } + + size_t ptr; + }; + + struct EvalPatch + { + /* creates EvalPatch from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) + { + size_t ofs = 0, bytes = patch.bytes(); + void* ptr = alloc(bytes); + patch.serialize(ptr,ofs); + assert(ofs == bytes); + return Ref(EVAL_PATCH, ptr); + } + }; + + struct BilinearPatch + { + /* creates BilinearPatch from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(patch)); + } + + __forceinline BilinearPatch (const CatmullClarkPatch& patch) + : patch(patch) {} + + /* creates BilinearPatch from 4 vertices */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(edge,vertices,stride)); + } + + __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + public: + BilinearPatchT<Vertex,Vertex_t> patch; + }; + + struct BSplinePatch + { + /* creates BSplinePatch from a half edge */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(edge,vertices,stride)); + } + + __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + /* creates BSplinePatch from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(patch,border0,border1,border2,border3)); + } + + __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + BSplinePatchT<Vertex,Vertex_t> patch; + }; + + struct BezierPatch + { + /* creates BezierPatch from a half edge */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(edge,vertices,stride)); + } + + __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + /* creates Bezier from a CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(patch,border0,border1,border2,border3)); + } + + __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + BezierPatchT<Vertex,Vertex_t> patch; + }; + + struct GregoryPatch + { + /* creates GregoryPatch from half edge */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(edge,vertices,stride)); + } + + __forceinline GregoryPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(CatmullClarkPatch(edge,vertices,stride)) {} + + /* creates GregoryPatch from CatmullClarkPatch */ + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(patch,border0,border1,border2,border3)); + } + + __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + GregoryPatchT<Vertex,Vertex_t> patch; + }; + + struct SubdividedQuadPatch + { + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, Ref children[4]) { + return Ref(SUBDIVIDED_QUAD_PATCH, new (alloc(sizeof(SubdividedQuadPatch))) SubdividedQuadPatch(children)); + } + + __forceinline SubdividedQuadPatch(Ref children[4]) { + for (size_t i=0; i<4; i++) child[i] = children[i]; + } + + public: + Ref child[4]; + }; + + struct SubdividedGeneralPatch + { + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) { + return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N)); + } + + __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) { + for (unsigned i=0; i<N; i++) child[i] = children[i]; + } + + unsigned N; + Ref child[MAX_PATCH_VALENCE]; + }; + + /*! Default constructor. */ + __forceinline PatchT () {} + + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) + { + if (PATCH_MAX_CACHE_DEPTH == 0) + return nullptr; + + Ref child(0); + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: child = BilinearPatch::create(alloc,edge,vertices,stride); break; + case HalfEdge::REGULAR_QUAD_PATCH: child = RegularPatch::create(alloc,edge,vertices,stride); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: child = GregoryPatch::create(alloc,edge,vertices,stride); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + child = PatchT::create(alloc,patch,edge,vertices,stride,0); + } + } + return child; + } + + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch qpatch; patch.init(qpatch); + return PatchT::create(alloc,qpatch,edge,vertices,stride,depth); + } + + /* do only cache up to some depth */ + if (depth >= PATCH_MAX_CACHE_DEPTH) + return nullptr; + + /* subdivide patch */ + unsigned N; + array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; + patch.subdivide(patches,N); + + if (N == 4) + { + Ref child[4]; +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r); + child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr); + child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr); + child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l); +#else + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + for (size_t i=0; i<4; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); +#endif + return SubdividedQuadPatch::create(alloc,child); + } + else + { + assert(N<MAX_PATCH_VALENCE); + Ref child[MAX_PATCH_VALENCE]; + +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; + patch.getLimitBorder(borders); + + for (size_t i0=0; i0<N; i0++) { + const size_t i2 = i0==0 ? N-1 : i0-1; + BezierCurve border0l,border0r; borders[i0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[i2].subdivide(border2l,border2r); + child[i0] = PatchT::create(alloc,patches[i0],edge,vertices,stride,depth+1, &border0l, nullptr, nullptr, &border2r); + } +#else + for (size_t i=0; i<N; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); +#endif + return SubdividedGeneralPatch::create(alloc,child,N); + } + + return nullptr; + } + + static __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + template<typename Allocator> + __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth, + const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr) + { + const typename CatmullClarkPatch::Type ty = patch.type(); + if (unlikely(final(patch,ty,depth))) { + if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3); + else return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3); + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3); + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3); + } +#endif + else if (depth >= PATCH_MAX_CACHE_DEPTH) { + return EvalPatch::create(alloc,patch); + } + + else + { + Ref child[4]; + array_t<CatmullClarkPatch,4> patches; + patch.subdivide(patches); + + for (size_t i=0; i<4; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); + return SubdividedQuadPatch::create(alloc,child); + } + } + }; + + typedef PatchT<Vec3fa,Vec3fa_t> Patch3fa; +} diff --git a/thirdparty/embree/kernels/subdiv/patch_eval.h b/thirdparty/embree/kernels/subdiv/patch_eval.h new file mode 100644 index 0000000000..a3fafa72f4 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch_eval.h @@ -0,0 +1,129 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval.h" + +namespace embree +{ + namespace isa + { + template<typename Vertex, typename Vertex_t = Vertex> + struct PatchEval + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, + const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + /* conservative time for the very first allocation */ + auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + + Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { + auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; + return Patch::create(alloc,edge,vertices,stride); + },true); + + auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); + + if (patch && allAllocationsValid && eval(patch,u,v,1.0f,0)) { + SharedLazyTessellationCache::unlock(); + return; + } + SharedLazyTessellationCache::unlock(); + FeatureAdaptiveEval<Vertex,Vertex_t>(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); + PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1); + } + + __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth) + { + if (v < 0.5f) { + if (u < 0.5f) return eval(This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); + else return eval(This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); + } else { + if (u > 0.5f) return eval(This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); + else return eval(This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); + } + } + + bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth) + { + const unsigned l = (unsigned) floor(0.5f*U); const float u = 2.0f*frac(0.5f*U)-0.5f; + const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; + const unsigned i = 4*h+l; assert(i<This->N); + return eval(This->child[i],u,v,1.0f,depth+1); + } + + bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) + { + if (!This) return false; + //PRINT(depth); + //PRINT2(u,v); + + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + //PRINT("bilinear"); + ((typename Patch::BilinearPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,c); + return true; + } + case Patch::BSPLINE_PATCH: { + //PRINT("bspline"); + ((typename Patch::BSplinePatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); + return true; + } + case Patch::BEZIER_PATCH: { + //PRINT("bezier"); + ((typename Patch::BezierPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); + return true; + } + case Patch::GREGORY_PATCH: { + //PRINT("gregory"); + ((typename Patch::GregoryPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,-1,c); + return true; + } + case Patch::SUBDIVIDED_QUAD_PATCH: { + //PRINT("subdivided quad"); + return eval_quad(((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); + } + case Patch::SUBDIVIDED_GENERAL_PATCH: { + //PRINT("general_patch"); + assert(dscale == 1.0f); + return eval_general(((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); + } + case Patch::EVAL_PATCH: { + //PRINT("eval_patch"); + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEval<Vertex,Vertex_t>(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); + return true; + } + default: + assert(false); + return false; + } + } + + private: + Vertex* const P; + Vertex* const dPdu; + Vertex* const dPdv; + Vertex* const ddPdudu; + Vertex* const ddPdvdv; + Vertex* const ddPdudv; + }; + } +} + diff --git a/thirdparty/embree/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree/kernels/subdiv/patch_eval_grid.h new file mode 100644 index 0000000000..167e1ebe1c --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch_eval_grid.h @@ -0,0 +1,245 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval_grid.h" + +namespace embree +{ + namespace isa + { + struct PatchEvalGrid + { + typedef Patch3fa Patch; + typedef Patch::Ref Ref; + typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch; + typedef CatmullClarkPatch3fa CatmullClarkPatch; + typedef BSplinePatch3fa BSplinePatch; + typedef BezierPatch3fa BezierPatch; + typedef GregoryPatch3fa GregoryPatch; + typedef BilinearPatch3fa BilinearPatch; + + private: + const unsigned x0,x1; + const unsigned y0,y1; + const unsigned swidth,sheight; + const float rcp_swidth, rcp_sheight; + float* const Px; + float* const Py; + float* const Pz; + float* const U; + float* const V; + float* const Nx; + float* const Ny; + float* const Nz; + const unsigned dwidth,dheight; + unsigned count; + + public: + + PatchEvalGrid (Ref patch, unsigned subPatch, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0) + { + assert(swidth < (2<<20) && sheight < (2<<20)); + const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); + const BBox2f erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1)); + bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange); + assert(done); + assert(count == (x1-x0+1)*(y1-y0+1)); + } + + template<typename Patch> + __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) + { + const float scale_x = rcp(srange.upper.x-srange.lower.x); + const float scale_y = rcp(srange.upper.y-srange.lower.y); + count += (lx1-lx0)*(ly1-ly0); + +#if 0 + for (unsigned iy=ly0; iy<ly1; iy++) { + for (unsigned ix=lx0; ix<lx1; ix++) { + const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x); + const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y); + const Vec3fa p = patch->patch.eval(lu,lv); + const float u = float(ix)*rcp_swidth; + const float v = float(iy)*rcp_sheight; + const int ofs = (iy-y0)*dwidth+(ix-x0); + Px[ofs] = p.x; + Py[ofs] = p.y; + Pz[ofs] = p.z; + U[ofs] = u; + V[ofs] = v; + } + } +#else + foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const Vec3vfx p = patch->patch.eval(lu,lv); + Vec3vfx n = zero; + if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); + const vfloatx u = vfloatx(ix)*rcp_swidth; + const vfloatx v = vfloatx(iy)*rcp_sheight; + const vintx ofs = (iy-y0)*dwidth+(ix-x0); + if (likely(all(valid)) && all(iy==iy[0])) { + const unsigned ofs2 = ofs[0]; + vfloatx::storeu(Px+ofs2,p.x); + vfloatx::storeu(Py+ofs2,p.y); + vfloatx::storeu(Pz+ofs2,p.z); + vfloatx::storeu(U+ofs2,u); + vfloatx::storeu(V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(Nx+ofs2,n.x); + vfloatx::storeu(Ny+ofs2,n.y); + vfloatx::storeu(Nz+ofs2,n.z); + } + } else { + foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { + const unsigned ofs2 = ofs[j]-j; + vfloatx::storeu(valid,Px+ofs2,p.x); + vfloatx::storeu(valid,Py+ofs2,p.y); + vfloatx::storeu(valid,Pz+ofs2,p.z); + vfloatx::storeu(valid,U+ofs2,u); + vfloatx::storeu(valid,V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(valid,Nx+ofs2,n.x); + vfloatx::storeu(valid,Ny+ofs2,n.y); + vfloatx::storeu(valid,Nz+ofs2,n.z); + } + }); + } + }); +#endif + } + + bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) + { + if (erange.empty()) + return true; + + const int lx0 = (int) ceilf(erange.lower.x); + const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); + const int ly0 = (int) ceilf(erange.lower.y); + const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); + if (lx0 >= lx1 || ly0 >= ly1) + return true; + + if (!This) + return false; + + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::BSPLINE_PATCH: { + evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::BEZIER_PATCH: { + evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::GREGORY_PATCH: { + evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::SUBDIVIDED_QUAD_PATCH: + { + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + + Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object(); + eval(patch->child[0],srange0,intersect(srange0,erange),depth+1); + eval(patch->child[1],srange1,intersect(srange1,erange),depth+1); + eval(patch->child[2],srange2,intersect(srange2,erange),depth+1); + eval(patch->child[3],srange3,intersect(srange3,erange),depth+1); + return true; + } + case Patch::EVAL_PATCH: { + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight); + count += (lx1-lx0)*(ly1-ly0); + return true; + } + default: + assert(false); + return false; + } + } + + bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) + { + if (!This) + return false; + + switch (This.type()) + { + case Patch::SUBDIVIDED_GENERAL_PATCH: { + Patch::SubdividedGeneralPatch* patch = (Patch::SubdividedGeneralPatch*)This.object(); + assert(subPatch < patch->N); + return eval(patch->child[subPatch],srange,erange,1); + } + default: + assert(subPatch == 0); + return eval(This,srange,erange,0); + } + } + }; + + __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h) + { + const unsigned N = h->numEdges(); + if (N == 4) return 1; + else return N; + } + + template<typename Tessellator> + inline void patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator) + { + const unsigned N = h->numEdges(); + int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t + float levels[GeneralCatmullClarkPatch3fa::SIZE]; + for (unsigned i=0; i<N; i++) { + assert(i<GeneralCatmullClarkPatch3fa::SIZE); + neighborSubdiv[i] = h->hasOpposite() ? h->opposite()->numEdges() != 4 : 0; + levels[i] = h->edge_level; + h = h->next(); + } + if (N == 4) + { + const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) }; + tessellator(uv,neighborSubdiv,levels,0); + } + else + { + for (unsigned i=0; i<N; i++) + { + assert(i<MAX_PATCH_VALENCE); + static_assert(MAX_PATCH_VALENCE <= 16, "MAX_PATCH_VALENCE > 16"); + const int h = (i >> 2) & 3, l = i & 3; + const Vec2f subPatchID((float)l,(float)h); + const Vec2f uv[4] = { 2.0f*subPatchID + (0.5f+Vec2f(0.0f,0.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(1.0f,0.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(1.0f,1.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(0.0f,1.0f)) }; + const int neighborSubdiv1[4] = { 0,0,0,0 }; + const float levels1[4] = { 0.5f*levels[(i+0)%N], 0.5f*levels[(i+0)%N], 0.5f*levels[(i+N-1)%N], 0.5f*levels[(i+N-1)%N] }; + tessellator(uv,neighborSubdiv1,levels1,i); + } + } + } + } +} + diff --git a/thirdparty/embree/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree/kernels/subdiv/patch_eval_simd.h new file mode 100644 index 0000000000..fef88a4492 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/patch_eval_simd.h @@ -0,0 +1,127 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval_simd.h" + +namespace embree +{ + namespace isa + { + template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex> + struct PatchEvalSimd + { + public: + + typedef PatchT<Vertex,Vertex_t> Patch; + typedef typename Patch::Ref Ref; + typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch; + + PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, + const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + /* conservative time for the very first allocation */ + auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + + Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { + auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; + return Patch::create(alloc,edge,vertices,stride); + }, true); + + auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); + + patch = allAllocationsValid ? patch : nullptr; + + /* use cached data structure for calculations */ + const vbool valid1 = patch ? eval(valid0,patch,u,v,1.0f,0) : vbool(false); + SharedLazyTessellationCache::unlock(); + const vbool valid2 = valid0 & !valid1; + if (any(valid2)) { + FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); + } + } + + vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) + { + vbool ret = false; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; + if (any(u0v0_mask)) ret |= eval(u0v0_mask,This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); + if (any(u1v0_mask)) ret |= eval(u1v0_mask,This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); + if (any(u1v1_mask)) ret |= eval(u1v1_mask,This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); + if (any(u0v1_mask)) ret |= eval(u0v1_mask,This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); + return ret; + } + + vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth) + { + vbool ret = false; + const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; + const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; + const vint i = (h<<2)+l; assert(all(valid,i<patch->N)); + foreach_unique(valid,i,[&](const vbool& valid, const int i) { + ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1); + }); + return ret; + } + + vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) + { + if (!This) return false; + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::BSPLINE_PATCH: { + ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::BEZIER_PATCH: { + ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::GREGORY_PATCH: { + ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::SUBDIVIDED_QUAD_PATCH: { + return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); + } + case Patch::SUBDIVIDED_GENERAL_PATCH: { + assert(dscale == 1.0f); + return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); + } + case Patch::EVAL_PATCH: { + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); + return valid; + } + default: + assert(false); + return false; + } + } + + private: + float* const P; + float* const dPdu; + float* const dPdv; + float* const ddPdudu; + float* const ddPdvdv; + float* const ddPdudv; + const size_t dstride; + const size_t N; + }; + } +} diff --git a/thirdparty/embree/kernels/subdiv/subdivpatch1base.h b/thirdparty/embree/kernels/subdiv/subdivpatch1base.h new file mode 100644 index 0000000000..c3069dadee --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/subdivpatch1base.h @@ -0,0 +1,156 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../geometry/primitive.h" +#include "bspline_patch.h" +#include "bezier_patch.h" +#include "gregory_patch.h" +#include "gregory_patch_dense.h" +#include "tessellation.h" +#include "tessellation_cache.h" +#include "gridrange.h" +#include "patch_eval_grid.h" +#include "feature_adaptive_eval_grid.h" +#include "../common/scene_subdiv_mesh.h" + +namespace embree +{ + struct __aligned(64) SubdivPatch1Base + { + public: + + enum Type { + INVALID_PATCH = 0, + BSPLINE_PATCH = 1, + BEZIER_PATCH = 2, + GREGORY_PATCH = 3, + EVAL_PATCH = 5, + BILINEAR_PATCH = 6, + }; + + enum Flags { + TRANSITION_PATCH = 16, + }; + + /*! Default constructor. */ + __forceinline SubdivPatch1Base () {} + + SubdivPatch1Base (const unsigned int gID, + const unsigned int pID, + const unsigned int subPatch, + const SubdivMesh *const mesh, + const size_t time, + const Vec2f uv[4], + const float edge_level[4], + const int subdiv[4], + const int simd_width); + + __forceinline bool needsStitching() const { + return flags & TRANSITION_PATCH; + } + + __forceinline Vec2f getUV(const size_t i) const { + return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000); + } + + static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]); + static Vec2i computeGridSize(const float level[4]); + bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width); + + public: + + __forceinline size_t getGridBytes() const { + const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4; + return 64*((grid_size_xyzuv+15) / 16); + } + + __forceinline void write_lock() { mtx.lock(); } + __forceinline void write_unlock() { mtx.unlock(); } + __forceinline bool try_write_lock() { return mtx.try_lock(); } + //__forceinline bool try_read_lock() { return mtx.try_read_lock(); } + + __forceinline void resetRootRef() { + //assert( mtx.hasInitialState() ); + root_ref = SharedLazyTessellationCache::Tag(); + } + + __forceinline SharedLazyTessellationCache::CacheEntry& entry() { + return (SharedLazyTessellationCache::CacheEntry&) root_ref; + } + + public: + __forceinline unsigned int geomID() const { + return geom; + } + + __forceinline unsigned int primID() const { + return prim; + } + + public: + SharedLazyTessellationCache::Tag root_ref; + SpinLock mtx; + + unsigned short u[4]; //!< 16bit discretized u,v coordinates + unsigned short v[4]; + float level[4]; + + unsigned char flags; + unsigned char type; + unsigned short grid_u_res; + unsigned int geom; //!< geometry ID of the subdivision mesh this patch belongs to + unsigned int prim; //!< primitive ID of this subdivision patch + unsigned short grid_v_res; + + unsigned short grid_size_simd_blocks; + unsigned int time_; + + struct PatchHalfEdge { + const HalfEdge* edge; + unsigned subPatch; + }; + + Vec3fa patch_v[4][4]; + + const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; } + unsigned time() const { return time_; } + unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; } + + void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; } + void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; } + }; + + namespace isa + { + Vec3fa patchEval(const SubdivPatch1Base& patch, const float uu, const float vv); + Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv); + + template<typename simdf> + Vec3<simdf> patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); + + template<typename simdf> + Vec3<simdf> patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); + + + /* eval grid over patch and stich edges when required */ + void evalGrid(const SubdivPatch1Base& patch, + const unsigned x0, const unsigned x1, + const unsigned y0, const unsigned y1, + const unsigned swidth, const unsigned sheight, + float *__restrict__ const grid_x, + float *__restrict__ const grid_y, + float *__restrict__ const grid_z, + float *__restrict__ const grid_u, + float *__restrict__ const grid_v, + const SubdivMesh* const geom); + + /* eval grid over patch and stich edges when required */ + BBox3fa evalGridBounds(const SubdivPatch1Base& patch, + const unsigned x0, const unsigned x1, + const unsigned y0, const unsigned y1, + const unsigned swidth, const unsigned sheight, + const SubdivMesh* const geom); + } +} diff --git a/thirdparty/embree/kernels/subdiv/tessellation.h b/thirdparty/embree/kernels/subdiv/tessellation.h new file mode 100644 index 0000000000..abde4f2bde --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/tessellation.h @@ -0,0 +1,161 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* adjust discret tessellation level for feature-adaptive pre-subdivision */ + __forceinline float adjustTessellationLevel(float l, const size_t sublevel) + { + for (size_t i=0; i<sublevel; i++) l *= 0.5f; + float r = ceilf(l); + for (size_t i=0; i<sublevel; i++) r *= 2.0f; + return r; + } + + __forceinline int stitch(const int x, const int fine, const int coarse) { + return (2*x+1)*coarse/(2*fine); + } + + __forceinline void stitchGridEdges(const unsigned int low_rate, + const unsigned int high_rate, + const unsigned int x0, + const unsigned int x1, + float * __restrict__ const uv_array, + const unsigned int uv_array_step) + { +#if 1 + const float inv_low_rate = rcp((float)(low_rate-1)); + for (unsigned x=x0; x<=x1; x++) { + uv_array[(x-x0)*uv_array_step] = float(stitch(x,high_rate-1,low_rate-1))*inv_low_rate; + } + if (unlikely(x1 == high_rate-1)) + uv_array[(x1-x0)*uv_array_step] = 1.0f; +#else + assert(low_rate < high_rate); + assert(high_rate >= 2); + + const float inv_low_rate = rcp((float)(low_rate-1)); + const unsigned int dy = low_rate - 1; + const unsigned int dx = high_rate - 1; + + int p = 2*dy-dx; + + unsigned int offset = 0; + unsigned int y = 0; + float value = 0.0f; + for(unsigned int x=0;x<high_rate-1; x++) // '<=' would be correct but we will leave the 1.0f at the end + { + uv_array[offset] = value; + + offset += uv_array_step; + if (unlikely(p > 0)) + { + y++; + value = (float)y * inv_low_rate; + p -= 2*dx; + } + p += 2*dy; + } +#endif + } + + __forceinline void stitchUVGrid(const float edge_levels[4], + const unsigned int swidth, + const unsigned int sheight, + const unsigned int x0, + const unsigned int y0, + const unsigned int grid_u_res, + const unsigned int grid_v_res, + float * __restrict__ const u_array, + float * __restrict__ const v_array) + { + const unsigned int x1 = x0+grid_u_res-1; + const unsigned int y1 = y0+grid_v_res-1; + const unsigned int int_edge_points0 = (unsigned int)edge_levels[0] + 1; + const unsigned int int_edge_points1 = (unsigned int)edge_levels[1] + 1; + const unsigned int int_edge_points2 = (unsigned int)edge_levels[2] + 1; + const unsigned int int_edge_points3 = (unsigned int)edge_levels[3] + 1; + + if (unlikely(y0 == 0 && int_edge_points0 < swidth)) + stitchGridEdges(int_edge_points0,swidth,x0,x1,u_array,1); + + if (unlikely(y1 == sheight-1 && int_edge_points2 < swidth)) + stitchGridEdges(int_edge_points2,swidth,x0,x1,&u_array[(grid_v_res-1)*grid_u_res],1); + + if (unlikely(x0 == 0 && int_edge_points1 < sheight)) + stitchGridEdges(int_edge_points1,sheight,y0,y1,&v_array[grid_u_res-1],grid_u_res); + + if (unlikely(x1 == swidth-1 && int_edge_points3 < sheight)) + stitchGridEdges(int_edge_points3,sheight,y0,y1,v_array,grid_u_res); + } + + __forceinline void gridUVTessellator(const float edge_levels[4], + const unsigned int swidth, + const unsigned int sheight, + const unsigned int x0, + const unsigned int y0, + const unsigned int grid_u_res, + const unsigned int grid_v_res, + float * __restrict__ const u_array, + float * __restrict__ const v_array) + { + assert( grid_u_res >= 1); + assert( grid_v_res >= 1); + assert( edge_levels[0] >= 1.0f ); + assert( edge_levels[1] >= 1.0f ); + assert( edge_levels[2] >= 1.0f ); + assert( edge_levels[3] >= 1.0f ); + +#if defined(__AVX__) + const vint8 grid_u_segments = vint8(swidth)-1; + const vint8 grid_v_segments = vint8(sheight)-1; + + const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments)); + const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments)); + + unsigned int index = 0; + vint8 v_i( zero ); + for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1) + { + vint8 u_i ( step ); + + const vbool8 m_v = v_i < grid_v_segments; + + for (unsigned int x=0;x<grid_u_res;x+=8, u_i += 8) + { + const vbool8 m_u = u_i < grid_u_segments; + const vfloat8 u = select(m_u, vfloat8(x0+u_i) * inv_grid_u_segments, 1.0f); + const vfloat8 v = select(m_v, vfloat8(y0+v_i) * inv_grid_v_segments, 1.0f); + vfloat8::storeu(&u_array[index + x],u); + vfloat8::storeu(&v_array[index + x],v); + } + } + #else + const vint4 grid_u_segments = vint4(swidth)-1; + const vint4 grid_v_segments = vint4(sheight)-1; + + const vfloat4 inv_grid_u_segments = rcp(vfloat4(grid_u_segments)); + const vfloat4 inv_grid_v_segments = rcp(vfloat4(grid_v_segments)); + + unsigned int index = 0; + vint4 v_i( zero ); + for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1) + { + vint4 u_i ( step ); + + const vbool4 m_v = v_i < grid_v_segments; + + for (unsigned int x=0;x<grid_u_res;x+=4, u_i += 4) + { + const vbool4 m_u = u_i < grid_u_segments; + const vfloat4 u = select(m_u, vfloat4(x0+u_i) * inv_grid_u_segments, 1.0f); + const vfloat4 v = select(m_v, vfloat4(y0+v_i) * inv_grid_v_segments, 1.0f); + vfloat4::storeu(&u_array[index + x],u); + vfloat4::storeu(&v_array[index + x],v); + } + } +#endif + } +} diff --git a/thirdparty/embree/kernels/subdiv/tessellation_cache.h b/thirdparty/embree/kernels/subdiv/tessellation_cache.h new file mode 100644 index 0000000000..99edf49be4 --- /dev/null +++ b/thirdparty/embree/kernels/subdiv/tessellation_cache.h @@ -0,0 +1,325 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" + +/* force a complete cache invalidation when running out of allocation space */ +#define FORCE_SIMPLE_FLUSH 0 + +#define THREAD_BLOCK_ATOMIC_ADD 4 + +#if defined(DEBUG) +#define CACHE_STATS(x) +#else +#define CACHE_STATS(x) +#endif + +namespace embree +{ + class SharedTessellationCacheStats + { + public: + /* stats */ + static std::atomic<size_t> cache_accesses; + static std::atomic<size_t> cache_hits; + static std::atomic<size_t> cache_misses; + static std::atomic<size_t> cache_flushes; + static size_t cache_num_patches; + __aligned(64) static SpinLock mtx; + + /* print stats for debugging */ + static void printStats(); + static void clearStats(); + }; + + void resizeTessellationCache(size_t new_size); + void resetTessellationCache(); + + //////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(64) ThreadWorkState + { + ALIGNED_STRUCT_(64); + + std::atomic<size_t> counter; + ThreadWorkState* next; + bool allocated; + + __forceinline ThreadWorkState(bool allocated = false) + : counter(0), next(nullptr), allocated(allocated) + { + assert( ((size_t)this % 64) == 0 ); + } + }; + + class __aligned(64) SharedLazyTessellationCache + { + public: + + static const size_t NUM_CACHE_SEGMENTS = 8; + static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512; + static const size_t COMMIT_INDEX_SHIFT = 32+8; +#if defined(__64BIT__) + static const size_t REF_TAG_MASK = 0xffffffffff; +#else + static const size_t REF_TAG_MASK = 0x7FFFFFFF; +#endif + static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1; + static const size_t BLOCK_SIZE = 64; + + + /*! Per thread tessellation ref cache */ + static __thread ThreadWorkState* init_t_state; + static ThreadWorkState* current_t_state; + + static __forceinline ThreadWorkState *threadState() + { + if (unlikely(!init_t_state)) + /* sets init_t_state, can't return pointer due to macosx icc bug*/ + SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState(); + return init_t_state; + } + + struct Tag + { + __forceinline Tag() : data(0) {} + + __forceinline Tag(void* ptr, size_t combinedTime) { + init(ptr,combinedTime); + } + + __forceinline Tag(size_t ptr, size_t combinedTime) { + init((void*)ptr,combinedTime); + } + + __forceinline void init(void* ptr, size_t combinedTime) + { + if (ptr == nullptr) { + data = 0; + return; + } + int64_t new_root_ref = (int64_t) ptr; + new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr(); + assert( new_root_ref <= (int64_t)REF_TAG_MASK ); + new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT; + data = new_root_ref; + } + + __forceinline int64_t get() const { return data.load(); } + __forceinline void set( int64_t v ) { data.store(v); } + __forceinline void reset() { data.store(0); } + + private: + atomic<int64_t> data; + }; + + static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; } + + struct CacheEntry + { + Tag tag; + SpinLock mutex; + }; + + private: + + float *data; + bool hugepages; + size_t size; + size_t maxBlocks; + ThreadWorkState *threadWorkState; + + __aligned(64) std::atomic<size_t> localTime; + __aligned(64) std::atomic<size_t> next_block; + __aligned(64) SpinLock reset_state; + __aligned(64) SpinLock linkedlist_mtx; + __aligned(64) std::atomic<size_t> switch_block_threshold; + __aligned(64) std::atomic<size_t> numRenderThreads; + + + public: + + + SharedLazyTessellationCache(); + ~SharedLazyTessellationCache(); + + void getNextRenderThreadWorkState(); + + __forceinline size_t maxAllocSize() const { + return switch_block_threshold; + } + + __forceinline size_t getCurrentIndex() { return localTime.load(); } + __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); } + + __forceinline size_t getTime(const size_t globalTime) { + return localTime.load()+NUM_CACHE_SEGMENTS*globalTime; + } + + + __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); } + __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); } + + __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; } + + static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); } + static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); } + static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); } + static __forceinline size_t getState() { return threadState()->counter.load(); } + static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); } + + static __forceinline size_t getTCacheTime(const size_t globalTime) { + return sharedLazyTessellationCache.getTime(globalTime); + } + + /* per thread lock */ + __forceinline void lockThreadLoop (ThreadWorkState *const t_state) + { + while(1) + { + size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1); + if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD)) + { + /* lock failed wait until sync phase is over */ + sharedLazyTessellationCache.unlockThread(t_state,-1); + sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0); + } + else + break; + } + } + + static __forceinline void* lookup(CacheEntry& entry, size_t globalTime) + { + const int64_t subdiv_patch_root_ref = entry.tag.get(); + CACHE_STATS(SharedTessellationCacheStats::cache_accesses++); + + if (likely(subdiv_patch_root_ref != 0)) + { + const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr(); + const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); + + if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) )) + { + CACHE_STATS(SharedTessellationCacheStats::cache_hits++); + return (void*) subdiv_patch_root; + } + } + CACHE_STATS(SharedTessellationCacheStats::cache_misses++); + return nullptr; + } + + template<typename Constructor> + static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor()) + { + ThreadWorkState *t_state = SharedLazyTessellationCache::threadState(); + + while (true) + { + sharedLazyTessellationCache.lockThreadLoop(t_state); + void* patch = SharedLazyTessellationCache::lookup(entry,globalTime); + if (patch) return (decltype(constructor())) patch; + + if (entry.mutex.try_lock()) + { + if (!validTag(entry.tag,globalTime)) + { + auto timeBefore = sharedLazyTessellationCache.getTime(globalTime); + auto ret = constructor(); // thread is locked here! + assert(ret); + /* this should never return nullptr */ + auto timeAfter = sharedLazyTessellationCache.getTime(globalTime); + auto time = before ? timeBefore : timeAfter; + __memory_barrier(); + entry.tag = SharedLazyTessellationCache::Tag(ret,time); + __memory_barrier(); + entry.mutex.unlock(); + return ret; + } + entry.mutex.unlock(); + } + SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state); + } + } + + __forceinline bool validCacheIndex(const size_t i, const size_t globalTime) + { +#if FORCE_SIMPLE_FLUSH == 1 + return i == getTime(globalTime); +#else + return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime); +#endif + } + + static __forceinline bool validTime(const size_t oldtime, const size_t newTime) + { + return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime; + } + + + static __forceinline bool validTag(const Tag& tag, size_t globalTime) + { + const int64_t subdiv_patch_root_ref = tag.get(); + if (subdiv_patch_root_ref == 0) return false; + const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); + return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime); + } + + void waitForUsersLessEqual(ThreadWorkState *const t_state, + const unsigned int users); + + __forceinline size_t alloc(const size_t blocks) + { + if (unlikely(blocks >= switch_block_threshold)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment"); + + assert(blocks < switch_block_threshold); + size_t index = next_block.fetch_add(blocks); + if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1; + return index; + } + + static __forceinline void* malloc(const size_t bytes) + { + size_t block_index = -1; + ThreadWorkState *const t_state = threadState(); + while (true) + { + block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE); + if (block_index == (size_t)-1) + { + sharedLazyTessellationCache.unlockThread(t_state); + sharedLazyTessellationCache.allocNextSegment(); + sharedLazyTessellationCache.lockThread(t_state); + continue; + } + break; + } + return sharedLazyTessellationCache.getBlockPtr(block_index); + } + + __forceinline void *getBlockPtr(const size_t block_index) + { + assert(block_index < maxBlocks); + assert(data); + assert(block_index*16 <= size); + return (void*)&data[block_index*16]; + } + + __forceinline void* getDataPtr() { return data; } + __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; } + __forceinline size_t getMaxBlocks() { return maxBlocks; } + __forceinline size_t getSize() { return size; } + + void allocNextSegment(); + void realloc(const size_t newSize); + + void reset(); + + static SharedLazyTessellationCache sharedLazyTessellationCache; + }; +} diff --git a/thirdparty/embree/patches/godot-changes-android.patch b/thirdparty/embree/patches/godot-changes-android.patch new file mode 100644 index 0000000000..a27f924bde --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-android.patch @@ -0,0 +1,103 @@ +diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp +index ba97dc227b..1679599608 100644 +--- a/thirdparty/embree/common/sys/sysinfo.cpp ++++ b/thirdparty/embree/common/sys/sysinfo.cpp +@@ -618,7 +618,10 @@ namespace embree + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +-#if defined(__MACOSX__) ++// -- GODOT start -- ++// #if defined(__MACOSX__) ++#if defined(__MACOSX__) || defined(__ANDROID__) ++// -- GODOT end -- + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); + #else +diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp +index a7827e18f7..f4014be89b 100644 +--- a/thirdparty/embree/common/sys/thread.cpp ++++ b/thirdparty/embree/common/sys/thread.cpp +@@ -158,7 +158,9 @@ namespace embree + /// Linux Platform + //////////////////////////////////////////////////////////////////////////////// + +-#if defined(__LINUX__) ++// -- GODOT start -- ++#if defined(__LINUX__) && !defined(__ANDROID__) ++// -- GODOT end -- + + #include <fstream> + #include <sstream> +@@ -247,6 +249,28 @@ namespace embree + } + #endif + ++// -- GODOT start -- ++//////////////////////////////////////////////////////////////////////////////// ++/// Android Platform ++//////////////////////////////////////////////////////////////////////////////// ++ ++#if defined(__ANDROID__) ++ ++namespace embree ++{ ++ /*! set affinity of the calling thread */ ++ void setAffinity(ssize_t affinity) ++ { ++ cpu_set_t cset; ++ CPU_ZERO(&cset); ++ CPU_SET(affinity, &cset); ++ ++ sched_setaffinity(0, sizeof(cset), &cset); ++ } ++} ++#endif ++// -- GODOT end -- ++ + //////////////////////////////////////////////////////////////////////////////// + /// FreeBSD Platform + //////////////////////////////////////////////////////////////////////////////// +@@ -355,7 +379,9 @@ namespace embree + pthread_attr_destroy(&attr); + + /* set affinity */ +-#if defined(__LINUX__) ++// -- GODOT start -- ++#if defined(__LINUX__) && !defined(__ANDROID__) ++// -- GODOT end -- + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); +@@ -370,7 +396,16 @@ namespace embree + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } ++// -- GODOT start -- ++#elif defined(__ANDROID__) ++ if (threadID >= 0) { ++ cpu_set_t cset; ++ CPU_ZERO(&cset); ++ CPU_SET(threadID, &cset); ++ sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); ++ } + #endif ++// -- GODOT end -- + + return thread_t(tid); + } +@@ -389,8 +424,14 @@ namespace embree + + /*! destroy a hardware thread by its handle */ + void destroyThread(thread_t tid) { ++// -- GODOT start -- ++#if defined(__ANDROID__) ++ FATAL("Can't destroy threads on Android."); ++#else + pthread_cancel(*(pthread_t*)tid); + delete (pthread_t*)tid; ++#endif ++// -- GODOT end -- + } + + /*! creates thread local storage */ diff --git a/thirdparty/embree/patches/godot-changes-misc.patch b/thirdparty/embree/patches/godot-changes-misc.patch new file mode 100644 index 0000000000..8bf0d9fa97 --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-misc.patch @@ -0,0 +1,105 @@ +diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h +index 79729c87ab..ed8dd7d40a 100644 +--- a/thirdparty/embree/common/sys/intrinsics.h ++++ b/thirdparty/embree/common/sys/intrinsics.h +@@ -34,8 +34,14 @@ + #endif + + #if defined(__WIN32__) +-# define NOMINMAX +-# include <windows.h> ++// -- GODOT start -- ++#if !defined(NOMINMAX) ++// -- GODOT end -- ++#define NOMINMAX ++// -- GODOT start -- ++#endif ++#include "windows.h" ++// -- GODOT end -- + #endif + + /* normally defined in pmmintrin.h, but we always need this */ +diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h +index 3fc5e99b8d..697e07bb86 100644 +--- a/thirdparty/embree/common/sys/platform.h ++++ b/thirdparty/embree/common/sys/platform.h +@@ -99,7 +99,9 @@ + #define dll_import + #endif + +-#ifdef __WIN32__ ++// -- GODOT start -- ++#if defined(__WIN32__) && !defined(__MINGW32__) ++// -- GODOT end -- + #if !defined(__noinline) + #define __noinline __declspec(noinline) + #endif +@@ -149,6 +151,9 @@ + #define DELETED = delete + #endif + ++// -- GODOT start -- ++#if !defined(likely) ++// -- GODOT end -- + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #define likely(expr) (expr) + #define unlikely(expr) (expr) +@@ -156,6 +161,9 @@ + #define likely(expr) __builtin_expect((bool)(expr),true ) + #define unlikely(expr) __builtin_expect((bool)(expr),false) + #endif ++// -- GODOT start -- ++#endif ++// -- GODOT end -- + + //////////////////////////////////////////////////////////////////////////////// + /// Error handling and debugging +diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp +index ba97dc227b..f1a59e511e 100644 +--- a/thirdparty/embree/common/sys/sysinfo.cpp ++++ b/thirdparty/embree/common/sys/sysinfo.cpp +@@ -248,7 +248,9 @@ namespace embree + #if defined(__X86_ASM__) + __noinline int64_t get_xcr0() + { +-#if defined (__WIN32__) ++// -- GODOT start -- ++#if defined (__WIN32__) && !defined (__MINGW32__) ++// -- GODOT end -- + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + xcr0 = _xgetbv(0); + return xcr0; +diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h +index 9c14b28745..4857e1e05e 100644 +--- a/thirdparty/embree/include/embree3/rtcore_common.h ++++ b/thirdparty/embree/include/embree3/rtcore_common.h +@@ -19,7 +19,9 @@ typedef int ssize_t; + #endif + #endif + +-#ifdef _WIN32 ++// -- GODOT start -- ++#if defined(_WIN32) && defined(_MSC_VER) ++// -- GODOT end -- + # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) + #else + # define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) +diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h +index 3fd15816e9..35bd49849f 100644 +--- a/thirdparty/embree/common/tasking/taskschedulertbb.h ++++ b/thirdparty/embree/common/tasking/taskschedulertbb.h +@@ -12,7 +12,13 @@ + #include "../sys/ref.h" + + #if defined(__WIN32__) ++// -- GODOT start -- ++#if !defined(NOMINMAX) ++// -- GODOT end -- + # define NOMINMAX ++// -- GODOT start -- ++#endif ++// -- GODOT end -- + #endif + + // We need to define these to avoid implicit linkage against +
\ No newline at end of file diff --git a/thirdparty/embree/patches/godot-changes-noexcept.patch b/thirdparty/embree/patches/godot-changes-noexcept.patch new file mode 100644 index 0000000000..c587a0e2be --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-noexcept.patch @@ -0,0 +1,630 @@ +diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h +index f052d8b468..645681ac63 100644 +--- a/thirdparty/embree/common/algorithms/parallel_for.h ++++ b/thirdparty/embree/common/algorithms/parallel_for.h +@@ -21,7 +21,10 @@ namespace embree + func(r.begin()); + }); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + } + + #elif defined(TASKING_TBB) +@@ -31,13 +34,19 @@ namespace embree + func(i); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -57,7 +66,10 @@ namespace embree + #if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + + #elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 +@@ -66,13 +78,19 @@ namespace embree + func(range<Index>(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) { + func(range<Index>(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -104,13 +122,19 @@ namespace embree + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } + +@@ -125,13 +149,19 @@ namespace embree + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } + +diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h +index f42ae2ec50..8271372ea4 100644 +--- a/thirdparty/embree/common/algorithms/parallel_reduce.h ++++ b/thirdparty/embree/common/algorithms/parallel_reduce.h +@@ -58,15 +58,19 @@ namespace embree + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction,context); +- if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (context.is_group_execution_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, + [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); }, + reduction); +- if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (tbb::task::self().is_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #endif + #else // TASKING_PPL +diff --git a/thirdparty/embree/common/lexers/stringstream.cpp b/thirdparty/embree/common/lexers/stringstream.cpp +index 42ffb10176..a037869506 100644 +--- a/thirdparty/embree/common/lexers/stringstream.cpp ++++ b/thirdparty/embree/common/lexers/stringstream.cpp +@@ -39,7 +39,10 @@ namespace embree + std::vector<char> str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); +- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ // -- GODOT start -- ++ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ if (!isValidChar(c)) abort(); ++ // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); +diff --git a/thirdparty/embree/common/sys/alloc.cpp b/thirdparty/embree/common/sys/alloc.cpp +index 1bc30fe9a5..abdd269069 100644 +--- a/thirdparty/embree/common/sys/alloc.cpp ++++ b/thirdparty/embree/common/sys/alloc.cpp +@@ -21,7 +21,10 @@ namespace embree + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return ptr; + } +@@ -128,7 +131,10 @@ namespace embree + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); +- if (ptr == nullptr) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == nullptr) throw std::bad_alloc(); ++ if (ptr == nullptr) abort(); ++ // -- GODOT end -- + hugepages = false; + return ptr; + } +@@ -145,7 +151,10 @@ namespace embree + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -156,7 +165,10 @@ namespace embree + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) +@@ -260,7 +272,10 @@ namespace embree + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +- if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ if (ptr == MAP_FAILED) abort(); ++ // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ +@@ -277,7 +292,10 @@ namespace embree + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -291,7 +309,10 @@ namespace embree + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ +diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h +index 8a6d9fa0a9..697e07bb86 100644 +--- a/thirdparty/embree/common/sys/platform.h ++++ b/thirdparty/embree/common/sys/platform.h +@@ -179,11 +179,19 @@ + #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(str); ++ abort(); ++ // -- GODOT end -- + #endif + + #define FATAL(x) THROW_RUNTIME_ERROR(x) +diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp +index dca835a716..ad438588a3 100644 +--- a/thirdparty/embree/common/tasking/taskschedulerinternal.cpp ++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.cpp +@@ -48,13 +48,15 @@ namespace embree + { + Task* prevTask = thread.task; + thread.task = this; +- try { +- if (thread.scheduler->cancellingException == nullptr) ++ // -- GODOT start -- ++ // try { ++ // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); +- } catch (...) { +- if (thread.scheduler->cancellingException == nullptr) +- thread.scheduler->cancellingException = std::current_exception(); +- } ++ // } catch (...) { ++ // if (thread.scheduler->cancellingException == nullptr) ++ // thread.scheduler->cancellingException = std::current_exception(); ++ // } ++ // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } +@@ -291,8 +293,11 @@ namespace embree + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); +- std::exception_ptr except = thread_loop(threadIndex); +- if (except != nullptr) std::rethrow_exception(except); ++ // -- GODOT start -- ++ // std::exception_ptr except = thread_loop(threadIndex); ++ // if (except != nullptr) std::rethrow_exception(except); ++ thread_loop(threadIndex); ++ // -- GODOT end -- + } + + void TaskScheduler::reset() { +@@ -324,7 +329,10 @@ namespace embree + return thread->scheduler->cancellingException == nullptr; + } + +- std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++// -- GODOT start -- ++// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++ void TaskScheduler::thread_loop(size_t threadIndex) ++// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation +@@ -347,9 +355,10 @@ namespace embree + swapThread(oldThread); + + /* remember exception to throw */ +- std::exception_ptr except = nullptr; +- if (cancellingException != nullptr) except = cancellingException; +- ++ // -- GODOT start -- ++ // std::exception_ptr except = nullptr; ++ // if (cancellingException != nullptr) except = cancellingException; ++ // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; + #if defined(__WIN32__) +@@ -367,7 +376,10 @@ namespace embree + yield(); + #endif + } +- return except; ++ // -- GODOT start -- ++ // return except; ++ return; ++ // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) +diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h +index c766a0bb6a..8fa6bb12fa 100644 +--- a/thirdparty/embree/common/tasking/taskschedulerinternal.h ++++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h +@@ -123,7 +123,10 @@ namespace embree + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) +- throw std::runtime_error("closure stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("closure stack overflow"); ++ abort(); ++ // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } +@@ -132,7 +135,10 @@ namespace embree + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) +- throw std::runtime_error("task stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task stack overflow"); ++ abort(); ++ // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; +@@ -238,7 +244,10 @@ namespace embree + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ +- std::exception_ptr thread_loop(size_t threadIndex); ++ // -- GODOT start -- ++ // std::exception_ptr thread_loop(size_t threadIndex); ++ void thread_loop(size_t threadIndex); ++ // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); +diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +index d8da78eed7..d857ff7d95 100644 +--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp ++++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +@@ -150,7 +150,10 @@ namespace embree + } + } + else { +- throw std::runtime_error("not supported node type in bvh_statistics"); ++ // -- GODOT start -- ++ // throw std::runtime_error("not supported node type in bvh_statistics"); ++ abort(); ++ // -- GODOT end -- + } + return s; + } +diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp +index 74e9fb335c..94b3819e42 100644 +--- a/thirdparty/embree/kernels/common/rtcore.cpp ++++ b/thirdparty/embree/kernels/common/rtcore.cpp +@@ -197,7 +197,10 @@ RTC_NAMESPACE_BEGIN; + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } +@@ -1350,7 +1353,10 @@ RTC_NAMESPACE_BEGIN; + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } +diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h +index 4e4b24e9c2..373e49a689 100644 +--- a/thirdparty/embree/kernels/common/rtcore.h ++++ b/thirdparty/embree/kernels/common/rtcore.h +@@ -25,52 +25,58 @@ namespace embree + #endif + + /*! Macros used in the rtcore API implementation */ +-#define RTC_CATCH_BEGIN try { ++// -- GODOT start -- ++// #define RTC_CATCH_BEGIN try { ++#define RTC_CATCH_BEGIN + +-#define RTC_CATCH_END(device) \ +- } catch (std::bad_alloc&) { \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END(device) \ ++// } catch (std::bad_alloc&) { \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END(device) + +-#define RTC_CATCH_END2(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END2(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END2(scene) + +-#define RTC_CATCH_END2_FALSE(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- return false; \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- return false; \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- return false; \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- return false; \ +- } ++// #define RTC_CATCH_END2_FALSE(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// return false; \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// return false; \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// return false; \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// return false; \ ++// } ++#define RTC_CATCH_END2_FALSE(scene) return false; ++// -- GODOT end -- + + #define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ +@@ -97,28 +103,38 @@ namespace embree + #define RTC_TRACE(x) + #endif + +- /*! used to throw embree API errors */ +- struct rtcore_error : public std::exception +- { +- __forceinline rtcore_error(RTCError error, const std::string& str) +- : error(error), str(str) {} +- +- ~rtcore_error() throw() {} +- +- const char* what () const throw () { +- return str.c_str(); +- } +- +- RTCError error; +- std::string str; +- }; ++// -- GODOT begin -- ++// /*! used to throw embree API errors */ ++// struct rtcore_error : public std::exception ++// { ++// __forceinline rtcore_error(RTCError error, const std::string& str) ++// : error(error), str(str) {} ++// ++// ~rtcore_error() throw() {} ++// ++// const char* what () const throw () { ++// return str.c_str(); ++// } ++// ++// RTCError error; ++// std::string str; ++// }; ++// -- GODOT end -- + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,str); ++ abort(); ++ // -- GODOT end -- + #endif + + #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ +diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp +index 0149055f2c..408d7eae6f 100644 +--- a/thirdparty/embree/kernels/common/scene.cpp ++++ b/thirdparty/embree/kernels/common/scene.cpp +@@ -792,16 +792,18 @@ namespace embree + } + + /* initiate build */ +- try { ++ // -- GODOT start -- ++ // try { + scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); +- } +- catch (...) { +- accels_clear(); +- updateInterface(); +- Lock<MutexSys> lock(schedulerMutex); +- this->scheduler = nullptr; +- throw; +- } ++ // } ++ // catch (...) { ++ // accels_clear(); ++ // updateInterface(); ++ // Lock<MutexSys> lock(schedulerMutex); ++ // this->scheduler = nullptr; ++ // throw; ++ // } ++ // -- GODOT end -- + } + + #endif diff --git a/thirdparty/embree/patches/godot-changes-ubsan.patch b/thirdparty/embree/patches/godot-changes-ubsan.patch new file mode 100644 index 0000000000..1336246f0d --- /dev/null +++ b/thirdparty/embree/patches/godot-changes-ubsan.patch @@ -0,0 +1,24 @@ +diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp +index bb4fc81dfe..d279dc4993 100644 +--- a/thirdparty/embree/kernels/builders/primrefgen.cpp ++++ b/thirdparty/embree/kernels/builders/primrefgen.cpp +@@ -184,6 +184,9 @@ namespace embree + + // special variants for grid meshes + ++// -- GODOT start -- ++#if defined(EMBREE_GEOMETRY_GRID) ++// -- GODOT end -- + PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) + { + PrimInfo pinfo(empty); +@@ -293,6 +296,9 @@ namespace embree + + return pinfo; + } ++// -- GODOT start -- ++#endif ++// -- GODOT end -- + + // ==================================================================================================== + // ==================================================================================================== |