From 767e374dced69b45db0afb30ca2ccf0bbbeef672 Mon Sep 17 00:00:00 2001 From: jfons Date: Thu, 20 May 2021 12:49:33 +0200 Subject: Upgrade Embree to the latest official release. Since Embree v3.13.0 supports AARCH64, switch back to the official repo instead of using Embree-aarch64. `thirdparty/embree/patches/godot-changes.patch` should now contain an accurate diff of the changes done to the library. --- .../embree/common/algorithms/parallel_any_of.h | 55 +++ .../embree/common/algorithms/parallel_filter.h | 93 +++++ thirdparty/embree/common/algorithms/parallel_for.h | 186 +++++++++ .../embree/common/algorithms/parallel_for_for.h | 149 +++++++ .../algorithms/parallel_for_for_prefix_sum.h | 112 +++++ thirdparty/embree/common/algorithms/parallel_map.h | 85 ++++ .../embree/common/algorithms/parallel_partition.h | 283 +++++++++++++ .../embree/common/algorithms/parallel_prefix_sum.h | 85 ++++ .../embree/common/algorithms/parallel_reduce.h | 150 +++++++ thirdparty/embree/common/algorithms/parallel_set.h | 52 +++ .../embree/common/algorithms/parallel_sort.h | 454 +++++++++++++++++++++ 11 files changed, 1704 insertions(+) create mode 100644 thirdparty/embree/common/algorithms/parallel_any_of.h create mode 100644 thirdparty/embree/common/algorithms/parallel_filter.h create mode 100644 thirdparty/embree/common/algorithms/parallel_for.h create mode 100644 thirdparty/embree/common/algorithms/parallel_for_for.h create mode 100644 thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h create mode 100644 thirdparty/embree/common/algorithms/parallel_map.h create mode 100644 thirdparty/embree/common/algorithms/parallel_partition.h create mode 100644 thirdparty/embree/common/algorithms/parallel_prefix_sum.h create mode 100644 thirdparty/embree/common/algorithms/parallel_reduce.h create mode 100644 thirdparty/embree/common/algorithms/parallel_set.h create mode 100644 thirdparty/embree/common/algorithms/parallel_sort.h (limited to 'thirdparty/embree/common/algorithms') diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h new file mode 100644 index 0000000000..a64e4a1889 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_any_of.h @@ -0,0 +1,55 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "parallel_reduce.h" + +namespace embree +{ + + template + __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) + { + bool ret = false; + +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred,&context](const tbb::blocked_range& r) { + if (context.is_group_execution_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + context.cancel_group_execution(); + } + } + }); +#else + tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred](const tbb::blocked_range& r) { + if (tbb::task::self().is_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + tbb::task::self().cancel_group_execution(); + } + } + }); +#endif +#else + ret = parallel_reduce (first, last, false, [pred](const range& r)->bool { + bool localret = false; + for (auto i=r.begin(); i() + ); +#endif + + return ret; + } + +} // end namespace diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h new file mode 100644 index 0000000000..090ef164c2 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_filter.h @@ -0,0 +1,93 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template + inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) + { + Index j = first; + for (Index i=first; i + inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) + { + /* sequential fallback */ + if (end-begin <= minStepSize) + return sequential_filter(data,begin,end,predicate); + + /* calculate number of tasks to use */ + enum { MAX_TASKS = 64 }; + const Index numThreads = TaskScheduler::threadCount(); + const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; + const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); + + /* filter blocks */ + Index nused[MAX_TASKS]; + Index nfree[MAX_TASKS]; + parallel_for(taskCount, [&](const Index taskIndex) + { + const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; + const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; + const Index i2 = sequential_filter(data,i0,i1,predicate); + nused[taskIndex] = i2-i0; + nfree[taskIndex] = i1-i2; + }); + + /* calculate offsets */ + Index sused=0; + Index sfree=0; + Index pfree[MAX_TASKS]; + for (Index i=0; i0; i--) + { + if (k0 > r1) break; + Index k1 = k0+nused[i]; + Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; + for (Index i=max(r0,k0); i= begin && dst < end); + assert(isrc >= begin && isrc < end); + data[dst++] = data[isrc]; + } + k0 = k1; + } + }); + + return begin+sused; + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h new file mode 100644 index 0000000000..645681ac63 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for.h @@ -0,0 +1,186 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../tasking/taskscheduler.h" +#include "../sys/array.h" +#include "../math/math.h" +#include "../math/range.h" + +namespace embree +{ + /* parallel_for without range */ + template + __forceinline void parallel_for( const Index N, const Func& func) + { +#if defined(TASKING_INTERNAL) + if (N) { + TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range& r) { + assert(r.size() == 1); + func(r.begin()); + }); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + } + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range and granulatity */ + template + __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) + { + assert(first <= last); +#if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { + func(range(i,i+1)); + }); + +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range */ + template + __forceinline void parallel_for( const Index first, const Index last, const Func& func) + { + assert(first <= last); + parallel_for(first,last,(Index)1,func); + } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) + + template + __forceinline void parallel_for_static( const Index N, const Func& func) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + + typedef tbb::affinity_partitioner affinity_partitioner; + + template + __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + +#else + + template + __forceinline void parallel_for_static( const Index N, const Func& func) + { + parallel_for(N,func); + } + + struct affinity_partitioner { + }; + + template + __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) + { + parallel_for(N,func); + } + +#endif +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h new file mode 100644 index 0000000000..92c37a4a38 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for.h @@ -0,0 +1,149 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template + __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + size_t k=0; + for (size_t i=0; i!=array2.size(); ++i) { + const size_t N = array2[i]->size(); + if (N) func(array2[i],range(0,N),k); + k+=N; + } + } + + class ParallelForForState + { + public: + + enum { MAX_TASKS = 64 }; + + __forceinline ParallelForForState () + : taskCount(0) {} + + template + __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { + init(array2,minStepSize); + } + + template + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + /* first calculate total number of elements */ + size_t N = 0; + for (size_t i=0; isize() : 0; + } + this->N = N; + + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (N+minStepSize-1)/minStepSize; + taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); + + /* calculate start (i,j) for each task */ + size_t taskIndex = 0; + i0[taskIndex] = 0; + j0[taskIndex] = 0; + size_t k0 = (++taskIndex)*N/taskCount; + for (size_t i=0, k=0; taskIndex < taskCount; i++) + { + assert(isize() : 0; + while (j= k0 && taskIndex < taskCount) { + assert(taskIndex + __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + ParallelForForState state(array2,minStepSize); + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) func(array2[i],range(r0,r1),k); + k+=r1-r0; j0 = 0; + } + }); + } + + template + __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) + { + parallel_for_for(array2,1,func); + } + + template + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + ParallelForForState state(array2,minStepSize); + Value temp[ParallelForForState::MAX_TASKS]; + + for (size_t i=0; isize() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range(r0,r1),k)); + k+=r1-r0; j0 = 0; + } + }); + + Value ret = identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_reduce(array2,1,identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h new file mode 100644 index 0000000000..b15b44a991 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h @@ -0,0 +1,112 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for_for.h" +#include "parallel_prefix_sum.h" + +namespace embree +{ + template + struct ParallelForForPrefixSumState : public ParallelForForState + { + __forceinline ParallelForForPrefixSumState () {} + + template + __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) + : ParallelForForState(array2,minStepSize) {} + + ParallelPrefixSumState prefix_state; + }; + + template + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i)); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); + } + + template + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h new file mode 100644 index 0000000000..15c098fe20 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_map.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /*! implementation of a key/value map with parallel construction */ + template + class parallel_map + { + /* key/value pair to build the map */ + struct KeyValue + { + __forceinline KeyValue () {} + + __forceinline KeyValue (const Key key, const Val val) + : key(key), val(val) {} + + __forceinline operator Key() const { + return key; + } + + public: + Key key; + Val val; + }; + + public: + + /*! parallel map constructors */ + parallel_map () {} + + /*! construction from pair of vectors */ + template + parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } + + /*! initialized the parallel map from a vector with keys and values */ + template + void init(const KeyVector& keys, const ValVector& values) + { + /* reserve sufficient space for all data */ + assert(keys.size() == values.size()); + vec.resize(keys.size()); + + /* generate key/value pairs */ + parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range& r) { + for (size_t i=r.begin(); i temp(keys.size()); + radix_sort(vec.data(),temp.data(),keys.size()); + } + + /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ + __forceinline const Val* lookup(const Key& key) const + { + typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return nullptr; + if (i->key != key) return nullptr; + return &i->val; + } + + /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ + __forceinline Val lookup(const Key& key, const Val& def) const + { + typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return def; + if (i->key != key) return def; + return i->val; + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h new file mode 100644 index 0000000000..a1cbdc8e04 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_partition.h @@ -0,0 +1,283 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" +#include "../math/range.h" + +namespace embree +{ + /* serial partitioning */ + template + __forceinline size_t serial_partitioning(T* array, + const size_t begin, + const size_t end, + V& leftReduction, + V& rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t) + { + T* l = array + begin; + T* r = array + end - 1; + + while(1) + { + /* *l < pivot */ + while (likely(l <= r && is_left(*l) )) + { + //prefetchw(l+4); // FIXME: enable? + reduction_t(leftReduction,*l); + ++l; + } + /* *r >= pivot) */ + while (likely(l <= r && !is_left(*r))) + { + //prefetchw(r-4); FIXME: enable? + reduction_t(rightReduction,*r); + --r; + } + if (r + class __aligned(64) parallel_partition_task + { + ALIGNED_CLASS_(64); + private: + + static const size_t MAX_TASKS = 64; + + T* array; + size_t N; + const IsLeft& is_left; + const Reduction_T& reduction_t; + const Reduction_V& reduction_v; + const Vi& identity; + + size_t numTasks; + __aligned(64) size_t counter_start[MAX_TASKS+1]; + __aligned(64) size_t counter_left[MAX_TASKS+1]; + __aligned(64) range leftMisplacedRanges[MAX_TASKS]; + __aligned(64) range rightMisplacedRanges[MAX_TASKS]; + __aligned(64) V leftReductions[MAX_TASKS]; + __aligned(64) V rightReductions[MAX_TASKS]; + + public: + + __forceinline parallel_partition_task(T* array, + const size_t N, + const Vi& identity, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + const size_t BLOCK_SIZE) + + : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), + numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} + + __forceinline const range* findStartRange(size_t& index, const range* const r, const size_t numRanges) + { + size_t i = 0; + while(index >= (size_t)r[i].size()) + { + assert(i < numRanges); + index -= (size_t)r[i].size(); + i++; + } + return &r[i]; + } + + __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, + const size_t numRightMisplacedRanges, + const size_t startID, + const size_t endID) + { + size_t leftLocalIndex = startID; + size_t rightLocalIndex = startID; + const range* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); + const range* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); + + size_t l_left = l_range->size() - leftLocalIndex; + size_t r_left = r_range->size() - rightLocalIndex; + T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; + T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; + size_t size = endID - startID; + size_t items = min(size,min(l_left,r_left)); + + while (size) + { + if (unlikely(l_left == 0)) + { + l_range++; + l_left = l_range->size(); + l = &array[l_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + if (unlikely(r_left == 0)) + { + r_range++; + r_left = r_range->size(); + r = &array[r_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + size -= items; + l_left -= items; + r_left -= items; + + while(items) { + items--; + xchg(*l++,*r++); + } + } + } + + __forceinline size_t partition(V& leftReduction, V& rightReduction) + { + /* partition the individual ranges for each task */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*N/numTasks; + const size_t endID = (taskID+1)*N/numTasks; + V local_left(identity); + V local_right(identity); + const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); + counter_start[taskID] = startID; + counter_left [taskID] = mid-startID; + leftReductions[taskID] = local_left; + rightReductions[taskID] = local_right; + }); + counter_start[numTasks] = N; + counter_left[numTasks] = 0; + + /* finalize the reductions */ + for (size_t i=0; i globalLeft (0,mid); + const range globalRight(mid,N); + + /* calculate all left and right ranges that are on the wrong global side */ + size_t numMisplacedRangesLeft = 0; + size_t numMisplacedRangesRight = 0; + size_t numMisplacedItemsLeft = 0; + size_t numMisplacedItemsRight = 0; + + for (size_t i=0; i left_range (counter_start[i], counter_start[i] + counter_left[i]); + const range right_range(counter_start[i] + counter_left[i], counter_start[i+1]); + const range left_misplaced = globalLeft. intersect(right_range); + const range right_misplaced = globalRight.intersect(left_range); + + if (!left_misplaced.empty()) + { + numMisplacedItemsLeft += left_misplaced.size(); + leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; + } + + if (!right_misplaced.empty()) + { + numMisplacedItemsRight += right_misplaced.size(); + rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; + } + } + assert( numMisplacedItemsLeft == numMisplacedItemsRight ); + + /* if no items are misplaced we are done */ + if (numMisplacedItemsLeft == 0) + return mid; + + /* otherwise we copy the items to the right place in parallel */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; + const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; + swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); + }); + + return mid; + } + }; + + template + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE = 128) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < BLOCK_SIZE)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task partition_task; + std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + template + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE, + size_t PARALLEL_THRESHOLD) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < PARALLEL_THRESHOLD)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task partition_task; + std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + + template + inline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const IsLeft& is_left, + size_t BLOCK_SIZE = 128) + { + size_t leftReduction = 0; + size_t rightReduction = 0; + return parallel_partitioning( + array,begin,end,0,leftReduction,rightReduction,is_left, + [] (size_t& t,const T& ref) { }, + [] (size_t& t0,size_t& t1) { }, + BLOCK_SIZE); + } + +} diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h new file mode 100644 index 0000000000..208bb4e480 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h @@ -0,0 +1,85 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" + +namespace embree +{ + template + struct ParallelPrefixSumState + { + enum { MAX_TASKS = 64 }; + Value counts[MAX_TASKS]; + Value sums [MAX_TASKS]; + }; + + template + __forceinline Value parallel_prefix_sum( ParallelPrefixSumState& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; + const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState::MAX_TASKS)); + + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; + const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; + state.counts[taskIndex] = func(range(i0,i1),state.sums[taskIndex]); + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) + { + /* perform single threaded prefix operation for small N */ + if (N < SINGLE_THREAD_THRESHOLD) + { + Value sum=identity; + for (size_t i=0; i state; + + /* initial run just sets up start values for subtasks */ + parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i + __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range(first,last)); + } + + template + __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range(first,last)); + } + + template + __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + const Index maxTasks = 512; + const Index threadCount = (Index) TaskScheduler::threadCount(); + taskCount = min(taskCount,threadCount,maxTasks); + + /* parallel invokation of all tasks */ + dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack + parallel_for(taskCount, [&](const Index taskIndex) { + const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; + const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; + values[taskIndex] = func(range(k0,k1)); + }); + + /* perform reduction over all tasks */ + Value v = identity; + for (Index i=0; i + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { +#if defined(TASKING_INTERNAL) + + /* fast path for small number of iterations */ + Index taskCount = (last-first+minStepSize-1)/minStepSize; + if (likely(taskCount == 1)) { + return func(range(first,last)); + } + return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction,context); + // -- GODOT start -- + // if (context.is_group_execution_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction); + // -- GODOT start -- + // if (tbb::task::self().is_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #endif +#else // TASKING_PPL + struct AlignedValue + { + char storage[__alignof(Value)+sizeof(Value)]; + static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; + Value* getValuePtr() { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } + const Value* getValuePtr() const { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } + AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } + AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } + AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; + AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + operator Value() const { return *getValuePtr(); } + }; + + struct Iterator_Index + { + Index v; + typedef std::forward_iterator_tag iterator_category; + typedef AlignedValue value_type; + typedef Index difference_type; + typedef Index distance_type; + typedef AlignedValue* pointer; + typedef AlignedValue& reference; + __forceinline Iterator_Index() {} + __forceinline Iterator_Index(Index v) : v(v) {} + __forceinline bool operator== (Iterator_Index other) { return v == other.v; } + __forceinline bool operator!= (Iterator_Index other) { return v != other.v; } + __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } + __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } + }; + + auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { + assert(begin.v < end.v); + return reduction(start, func(range(begin.v, end.v))); + }; + const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); + return v; +#endif + } + + template + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + if (likely(last-first < parallel_threshold)) { + return func(range(first,last)); + } else { + return parallel_reduce(first,last,minStepSize,identity,func,reduction); + } + } + + template + __forceinline Value parallel_reduce( const range range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); + } + + template + __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + auto funcr = [&] ( const range r ) { + Value v = identity; + for (Index i=r.begin(); i + __forceinline Value parallel_reduce( const range range, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h new file mode 100644 index 0000000000..7eae577457 --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_set.h @@ -0,0 +1,52 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_sort.h" + +namespace embree +{ + /* implementation of a set of values with parallel construction */ + template + class parallel_set + { + public: + + /*! default constructor for the parallel set */ + parallel_set () {} + + /*! construction from vector */ + template + parallel_set (const Vector& in) { init(in); } + + /*! initialized the parallel set from a vector */ + template + void init(const Vector& in) + { + /* copy data to internal vector */ + vec.resize(in.size()); + parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range& r) { + for (size_t i=r.begin(); i temp(in.size()); + radix_sort(vec.data(),temp.data(),vec.size()); + } + + /*! tests if some element is in the set */ + __forceinline bool lookup(const T& elt) const { + return std::binary_search(vec.begin(), vec.end(), elt); + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h new file mode 100644 index 0000000000..30e56c2bfc --- /dev/null +++ b/thirdparty/embree/common/algorithms/parallel_sort.h @@ -0,0 +1,454 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../simd/simd.h" +#include "parallel_for.h" +#include + +namespace embree +{ + template + __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i 0 && v < array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template + __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i 0 && v > array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template + void quicksort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_ascending(t, begin, pivot); + quicksort_ascending(t, pivot + 1, end); + } + } + + template + void quicksort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_decending(t, begin, pivot); + quicksort_decending(t, pivot + 1, end); + } + } + + + template + void quicksort_insertionsort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_ascending(&t[begin],size); + } + else + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_ascending(t, begin, pivot); + quicksort_insertionsort_ascending(t, pivot + 1, end); + } + } + } + + + template + void quicksort_insertionsort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_decending(&t[begin],size); + } + else + { + + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_decending(t, begin, pivot); + quicksort_insertionsort_decending(t, pivot + 1, end); + } + } + } + + template + static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) + { + static const unsigned int BITS = 8; + static const unsigned int BUCKETS = (1 << BITS); + static const unsigned int CMP_SORT_THRESHOLD = 16; + + __aligned(64) unsigned int count[BUCKETS]; + + /* clear buckets */ + for (size_t i=0;i> shift) & (BUCKETS-1)]++; + + /* prefix sums */ + __aligned(64) unsigned int head[BUCKETS]; + __aligned(64) unsigned int tail[BUCKETS]; + + head[0] = 0; + for (size_t i=1; i> shift) & (BUCKETS-1); + if (b == i) break; + std::swap(v,morton[head[b]++]); + } + assert((unsigned(v) >> shift & (BUCKETS-1)) == i); + morton[head[i]++] = v; + } + } + if (shift == 0) return; + + size_t offset = 0; + for (size_t i=0;i> shift) & (BUCKETS-1)) == i); + + if (unlikely(count[i] < CMP_SORT_THRESHOLD)) + insertionsort_ascending(morton + offset, count[i]); + else + radixsort32(morton + offset, count[i], shift-BITS); + + for (size_t j=offset;j + class ParallelRadixSort + { + static const size_t MAX_TASKS = 64; + static const size_t BITS = 8; + static const size_t BUCKETS = (1 << BITS); + typedef unsigned int TyRadixCount[BUCKETS]; + + template + static bool compare(const T& v0, const T& v1) { + return (Key)v0 < (Key)v1; + } + + private: + ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement + ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement + + + public: + ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) + : radixCount(nullptr), src(src), tmp(tmp), N(N) {} + + void sort(const size_t blockSize) + { + assert(blockSize > 0); + + /* perform single threaded sort for small N */ + if (N<=blockSize) // handles also special case of 0! + { + /* do inplace sort inside destination array */ + std::sort(src,src+N,compare); + } + + /* perform parallel sort for large N */ + else + { + const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); + tbbRadixSort(numThreads); + } + } + + ~ParallelRadixSort() + { + alignedFree(radixCount); + radixCount = nullptr; + } + + private: + + void tbbRadixIteration0(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* count how many items go into the buckets */ + for (size_t i=0; i> (size_t)shift) & (size_t)mask; +#else + const Key index = ((Key)src[i] >> shift) & mask; +#endif + count[index]++; + } + } + + void tbbRadixIteration1(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* calculate total number of items for each bucket */ + __aligned(64) unsigned int total[BUCKETS]; + /* + for (size_t i=0; i> (size_t)shift) & (size_t)mask; +#else + const size_t index = ((Key)src[i] >> shift) & mask; +#endif + dst[offset[index]++] = elt; + } + } + + void tbbRadixIteration(const Key shift, const bool last, + const Ty* __restrict src, Ty* __restrict dst, + const size_t numTasks) + { + affinity_partitioner ap; + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); + } + + void tbbRadixSort(const size_t numTasks) + { + radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); + + if (sizeof(Key) == sizeof(uint32_t)) { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,1,tmp,src,numTasks); + } + else if (sizeof(Key) == sizeof(uint64_t)) + { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,0,tmp,src,numTasks); + tbbRadixIteration(4*BITS,0,src,tmp,numTasks); + tbbRadixIteration(5*BITS,0,tmp,src,numTasks); + tbbRadixIteration(6*BITS,0,src,tmp,numTasks); + tbbRadixIteration(7*BITS,1,tmp,src,numTasks); + } + } + + private: + TyRadixCount* radixCount; + Ty* const src; + Ty* const tmp; + const size_t N; + }; + + template + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort(src,tmp,N).sort(blockSize); + } + + template + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort(src,tmp,N).sort(blockSize); + } + + template + void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort(src,tmp,N,blockSize); + } + + template + void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort(src,tmp,N,blockSize); + } +} -- cgit v1.2.3