Diffstat (limited to 'thirdparty/embree/common/algorithms')
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_any_of.h               55
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_filter.h               93
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_for.h                 186
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_for_for.h             149
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h  112
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_map.h                  85
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_partition.h           283
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_prefix_sum.h           85
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_reduce.h              150
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_set.h                  52
-rw-r--r--  thirdparty/embree/common/algorithms/parallel_sort.h                454
11 files changed, 1704 insertions, 0 deletions
diff --git a/thirdparty/embree/common/algorithms/parallel_any_of.h b/thirdparty/embree/common/algorithms/parallel_any_of.h
new file mode 100644
index 0000000000..a64e4a1889
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_any_of.h
@@ -0,0 +1,55 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <functional>
+#include "parallel_reduce.h"
+
+namespace embree
+{
+
+ template<typename Index, class UnaryPredicate>
+ __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
+ {
+ bool ret = false;
+
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
+ if (context.is_group_execution_cancelled()) return;
+ for (size_t i = r.begin(); i != r.end(); ++i) {
+ if (pred(i)) {
+ ret = true;
+ context.cancel_group_execution();
+ }
+ }
+ });
+#else
+ tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
+ if (tbb::task::self().is_cancelled()) return;
+ for (size_t i = r.begin(); i != r.end(); ++i) {
+ if (pred(i)) {
+ ret = true;
+ tbb::task::self().cancel_group_execution();
+ }
+ }
+ });
+#endif
+#else
+ ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
+ bool localret = false;
+ for (auto i=r.begin(); i<r.end(); ++i) {
+ localret |= pred(i);
+ }
+ return localret;
+ },
+ std::bit_or<bool>()
+ );
+#endif
+
+ return ret;
+ }
+
+} // end namespace
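
Usage note (not part of the patch): a minimal sketch of how parallel_any_of is called, assuming the embree common headers are on the include path, one tasking backend (TASKING_INTERNAL, TASKING_TBB, or TASKING_PPL) is defined, and the task scheduler is running; contains_negative is an illustrative name.

```cpp
#include "parallel_any_of.h"
#include <vector>

// Returns true as soon as any element tests negative; on the TBB path the
// remaining iterations are cancelled once a match is found.
bool contains_negative(const std::vector<int>& v)
{
  return embree::parallel_any_of(size_t(0), v.size(),
                                 [&](size_t i) { return v[i] < 0; });
}
```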
diff --git a/thirdparty/embree/common/algorithms/parallel_filter.h b/thirdparty/embree/common/algorithms/parallel_filter.h
new file mode 100644
index 0000000000..090ef164c2
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_filter.h
@@ -0,0 +1,93 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename Ty, typename Index, typename Predicate>
+ inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+ {
+ Index j = first;
+ for (Index i=first; i<last; i++)
+ if (predicate(data[i]))
+ data[j++] = data[i];
+
+ return j;
+ }
+
+ template<typename Ty, typename Index, typename Predicate>
+ inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+ {
+ /* sequential fallback */
+ if (end-begin <= minStepSize)
+ return sequential_filter(data,begin,end,predicate);
+
+ /* calculate number of tasks to use */
+ enum { MAX_TASKS = 64 };
+ const Index numThreads = TaskScheduler::threadCount();
+ const Index numBlocks = (end-begin+minStepSize-1)/minStepSize;
+ const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+ /* filter blocks */
+ Index nused[MAX_TASKS];
+ Index nfree[MAX_TASKS];
+ parallel_for(taskCount, [&](const Index taskIndex)
+ {
+ const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+ const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+ const Index i2 = sequential_filter(data,i0,i1,predicate);
+ nused[taskIndex] = i2-i0;
+ nfree[taskIndex] = i1-i2;
+ });
+
+ /* calculate offsets */
+ Index sused=0;
+ Index sfree=0;
+ Index pfree[MAX_TASKS];
+ for (Index i=0; i<taskCount; i++)
+ {
+ sused+=nused[i];
+ Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+ }
+
+ /* return if we did not filter out any element */
+ assert(sfree <= end-begin);
+ assert(sused <= end-begin);
+ if (sused == end-begin)
+ return end;
+
+ /* otherwise we have to copy misplaced elements around */
+ parallel_for(taskCount, [&](const Index taskIndex)
+ {
+ /* destination to write elements to */
+ Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+ Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+ if (dst_end <= dst) return;
+
+ /* range of misplaced elements to copy to destination */
+ Index r0 = pfree[taskIndex];
+ Index r1 = r0+dst_end-dst;
+
+ /* find ranges of misplaced elements in back-to-front order */
+ Index k0=0;
+ for (Index i=taskCount-1; i>0; i--)
+ {
+ if (k0 > r1) break;
+ Index k1 = k0+nused[i];
+ Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
+ for (Index i=max(r0,k0); i<min(r1,k1); i++) {
+ Index isrc = src-i+k0-1;
+ assert(dst >= begin && dst < end);
+ assert(isrc >= begin && isrc < end);
+ data[dst++] = data[isrc];
+ }
+ k0 = k1;
+ }
+ });
+
+ return begin+sused;
+ }
+}
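
Usage note (not part of the patch): a sketch of the compaction pattern this header implements, under the same header/backend assumptions as above; keep_even is an illustrative name.

```cpp
#include "parallel_filter.h"
#include <cstdint>
#include <vector>

// Compacts the elements satisfying the predicate to the front of the buffer
// and returns the new logical end; contents past that index are unspecified.
// minStepSize (1024 here) bounds the per-task granularity.
size_t keep_even(std::vector<uint32_t>& v)
{
  return embree::parallel_filter(v.data(), size_t(0), v.size(), size_t(1024),
    [](const uint32_t& x) { return (x & 1) == 0; });
}
```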
diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h
new file mode 100644
index 0000000000..645681ac63
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/math.h"
+#include "../math/range.h"
+
+namespace embree
+{
+ /* parallel_for without range */
+ template<typename Index, typename Func>
+ __forceinline void parallel_for( const Index N, const Func& func)
+ {
+#if defined(TASKING_INTERNAL)
+ if (N) {
+ TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+ assert(r.size() == 1);
+ func(r.begin());
+ });
+ if (!TaskScheduler::wait())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ }
+
+#elif defined(TASKING_TBB)
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ });
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+
+#elif defined(TASKING_PPL)
+ concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ });
+#else
+# error "no tasking system enabled"
+#endif
+ }
+
+ /* parallel for with range and granularity */
+ template<typename Index, typename Func>
+ __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+ {
+ assert(first <= last);
+#if defined(TASKING_INTERNAL)
+ TaskScheduler::spawn(first,last,minStepSize,func);
+ if (!TaskScheduler::wait())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+
+#elif defined(TASKING_TBB)
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+ func(range<Index>(r.begin(),r.end()));
+ },context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+ func(range<Index>(r.begin(),r.end()));
+ });
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+
+#elif defined(TASKING_PPL)
+ concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) {
+ func(range<Index>(i,i+1));
+ });
+
+#else
+# error "no tasking system enabled"
+#endif
+ }
+
+ /* parallel for with range */
+ template<typename Index, typename Func>
+ __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+ {
+ assert(first <= last);
+ parallel_for(first,last,(Index)1,func);
+ }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_static( const Index N, const Func& func)
+ {
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },tbb::simple_partitioner(),context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },tbb::simple_partitioner());
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+ }
+
+ typedef tbb::affinity_partitioner affinity_partitioner;
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+ {
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },ap,context);
+ if (context.is_group_execution_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #else
+ tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+ func(i);
+ },ap);
+ if (tbb::task::self().is_cancelled())
+ // -- GODOT start --
+ // throw std::runtime_error("task cancelled");
+ abort();
+ // -- GODOT end --
+ #endif
+ }
+
+#else
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_static( const Index N, const Func& func)
+ {
+ parallel_for(N,func);
+ }
+
+ struct affinity_partitioner {
+ };
+
+ template<typename Index, typename Func>
+ __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap)
+ {
+ parallel_for(N,func);
+ }
+
+#endif
+}
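
Usage note (not part of the patch): a sketch of the two main entry points, under the same assumptions as above; scale is an illustrative name.

```cpp
#include "parallel_for.h"
#include <vector>

void scale(std::vector<float>& v, float s)
{
  // Per-index variant: the functor receives one index at a time.
  embree::parallel_for(v.size(), [&](size_t i) { v[i] *= s; });

  // Ranged variant with a minimum grain of 1024 iterations per task.
  embree::parallel_for(size_t(0), v.size(), size_t(1024),
    [&](const embree::range<size_t>& r) {
      for (size_t i = r.begin(); i < r.end(); i++) v[i] *= s;
    });
}
```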
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h
new file mode 100644
index 0000000000..92c37a4a38
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for_for.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename ArrayArray, typename Func>
+ __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+ {
+ size_t k=0;
+ for (size_t i=0; i!=array2.size(); ++i) {
+ const size_t N = array2[i]->size();
+ if (N) func(array2[i],range<size_t>(0,N),k);
+ k+=N;
+ }
+ }
+
+ class ParallelForForState
+ {
+ public:
+
+ enum { MAX_TASKS = 64 };
+
+ __forceinline ParallelForForState ()
+ : taskCount(0) {}
+
+ template<typename ArrayArray>
+ __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
+ init(array2,minStepSize);
+ }
+
+ template<typename ArrayArray>
+ __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+ {
+ /* first calculate total number of elements */
+ size_t N = 0;
+ for (size_t i=0; i<array2.size(); i++) {
+ N += array2[i] ? array2[i]->size() : 0;
+ }
+ this->N = N;
+
+ /* calculate number of tasks to use */
+ const size_t numThreads = TaskScheduler::threadCount();
+ const size_t numBlocks = (N+minStepSize-1)/minStepSize;
+ taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
+
+ /* calculate start (i,j) for each task */
+ size_t taskIndex = 0;
+ i0[taskIndex] = 0;
+ j0[taskIndex] = 0;
+ size_t k0 = (++taskIndex)*N/taskCount;
+ for (size_t i=0, k=0; taskIndex < taskCount; i++)
+ {
+ assert(i<array2.size());
+ size_t j=0, M = array2[i] ? array2[i]->size() : 0;
+ while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
+ assert(taskIndex<taskCount);
+ i0[taskIndex] = i;
+ j0[taskIndex] = j += k0-k;
+ k=k0;
+ k0 = (++taskIndex)*N/taskCount;
+ }
+ k+=M-j;
+ }
+ }
+
+ __forceinline size_t size() const {
+ return N;
+ }
+
+ public:
+ size_t i0[MAX_TASKS];
+ size_t j0[MAX_TASKS];
+ size_t taskCount;
+ size_t N;
+ };
+
+ template<typename ArrayArray, typename Func>
+ __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+ {
+ ParallelForForState state(array2,minStepSize);
+
+ parallel_for(state.taskCount, [&](const size_t taskIndex)
+ {
+ /* calculate range */
+ const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t N = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(N,r0+k1-k);
+ if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
+ k+=r1-r0; j0 = 0;
+ }
+ });
+ }
+
+ template<typename ArrayArray, typename Func>
+ __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
+ {
+ parallel_for_for(array2,1,func);
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ ParallelForForState state(array2,minStepSize);
+ Value temp[ParallelForForState::MAX_TASKS];
+
+ for (size_t i=0; i<state.taskCount; i++)
+ temp[i] = identity;
+
+ parallel_for(state.taskCount, [&](const size_t taskIndex)
+ {
+ /* calculate range */
+ const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t N = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(N,r0+k1-k);
+ if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
+ k+=r1-r0; j0 = 0;
+ }
+ });
+
+ Value ret = identity;
+ for (size_t i=0; i<state.taskCount; i++)
+ ret = reduction(ret,temp[i]);
+ return ret;
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ return parallel_for_for_reduce(array2,1,identity,func,reduction);
+ }
+}
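
Usage note (not part of the patch): a sketch of reducing over a flattened array-of-arrays without materializing it, under the same assumptions as above; count_nonzero is an illustrative name.

```cpp
#include "parallel_for_for.h"
#include <vector>

// The functor receives the inner array, the local sub-range to process,
// and the global flattened offset k (unused here).
size_t count_nonzero(std::vector<std::vector<int>*>& vv)
{
  return embree::parallel_for_for_reduce(vv, size_t(1), size_t(0),
    [](std::vector<int>* v, const embree::range<size_t>& r, size_t /*k*/) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++) n += ((*v)[j] != 0);
      return n;
    },
    [](size_t a, size_t b) { return a + b; });
}
```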
diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
new file mode 100644
index 0000000000..b15b44a991
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for_for.h"
+#include "parallel_prefix_sum.h"
+
+namespace embree
+{
+ template<typename Value>
+ struct ParallelForForPrefixSumState : public ParallelForForState
+ {
+ __forceinline ParallelForForPrefixSumState () {}
+
+ template<typename ArrayArray>
+ __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
+ : ParallelForForState(array2,minStepSize) {}
+
+ ParallelPrefixSumState<Value> prefix_state;
+ };
+
+ template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ /* calculate number of tasks to use */
+ const size_t taskCount = state.taskCount;
+ /* perform parallel prefix sum */
+ parallel_for(taskCount, [&](const size_t taskIndex)
+ {
+ const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ Value N=identity;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t size = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(size,r0+k1-k);
+ if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i));
+ k+=r1-r0; j0 = 0;
+ }
+ state.prefix_state.counts[taskIndex] = N;
+ });
+
+ /* calculate prefix sum */
+ Value sum=identity;
+ for (size_t i=0; i<taskCount; i++)
+ {
+ const Value c = state.prefix_state.counts[i];
+ state.prefix_state.sums[i] = sum;
+ sum=reduction(sum,c);
+ }
+
+ return sum;
+ }
+
+ template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ /* calculate number of tasks to use */
+ const size_t taskCount = state.taskCount;
+ /* perform parallel prefix sum */
+ parallel_for(taskCount, [&](const size_t taskIndex)
+ {
+ const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+ const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+ size_t i0 = state.i0[taskIndex];
+ size_t j0 = state.j0[taskIndex];
+
+ /* iterate over arrays */
+ size_t k=k0;
+ Value N=identity;
+ for (size_t i=i0; k<k1; i++) {
+ const size_t size = array2[i] ? array2[i]->size() : 0;
+ const size_t r0 = j0, r1 = min(size,r0+k1-k);
+ if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
+ k+=r1-r0; j0 = 0;
+ }
+ state.prefix_state.counts[taskIndex] = N;
+ });
+
+ /* calculate prefix sum */
+ Value sum=identity;
+ for (size_t i=0; i<taskCount; i++)
+ {
+ const Value c = state.prefix_state.counts[i];
+ state.prefix_state.sums[i] = sum;
+ sum=reduction(sum,c);
+ }
+
+ return sum;
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction);
+ }
+
+ template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2,
+ const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction);
+ }
+}
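
Usage note (not part of the patch): a sketch of the two-pass count-then-write pattern these functions are built for, under the same assumptions as above; compact_positive and the positivity predicate are illustrative.

```cpp
#include "parallel_for_for_prefix_sum.h"
#include <functional>
#include <vector>

// Pass 0 computes per-chunk counts; pass 1 additionally receives the running
// prefix (base), so every chunk can write to its final slot in parallel.
void compact_positive(std::vector<std::vector<int>*>& vv, std::vector<int>& out)
{
  embree::ParallelForForPrefixSumState<size_t> state(vv, size_t(1));

  const size_t total = embree::parallel_for_for_prefix_sum0(state, vv, size_t(0),
    [](std::vector<int>* v, const embree::range<size_t>& r, size_t /*k*/, size_t /*i*/) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++) n += ((*v)[j] > 0);
      return n;
    }, std::plus<size_t>());

  out.resize(total);

  embree::parallel_for_for_prefix_sum1(state, vv, size_t(0),
    [&](std::vector<int>* v, const embree::range<size_t>& r, size_t /*k*/, size_t /*i*/, size_t base) -> size_t {
      size_t n = 0;
      for (size_t j = r.begin(); j < r.end(); j++)
        if ((*v)[j] > 0) out[base + n++] = (*v)[j];
      return n;
    }, std::plus<size_t>());
}
```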
diff --git a/thirdparty/embree/common/algorithms/parallel_map.h b/thirdparty/embree/common/algorithms/parallel_map.h
new file mode 100644
index 0000000000..15c098fe20
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_map.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+ /*! implementation of a key/value map with parallel construction */
+ template<typename Key, typename Val>
+ class parallel_map
+ {
+ /* key/value pair to build the map */
+ struct KeyValue
+ {
+ __forceinline KeyValue () {}
+
+ __forceinline KeyValue (const Key key, const Val val)
+ : key(key), val(val) {}
+
+ __forceinline operator Key() const {
+ return key;
+ }
+
+ public:
+ Key key;
+ Val val;
+ };
+
+ public:
+
+ /*! parallel map constructors */
+ parallel_map () {}
+
+ /*! construction from pair of vectors */
+ template<typename KeyVector, typename ValVector>
+ parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
+
+ /*! initializes the parallel map from a vector of keys and a vector of values */
+ template<typename KeyVector, typename ValVector>
+ void init(const KeyVector& keys, const ValVector& values)
+ {
+ /* reserve sufficient space for all data */
+ assert(keys.size() == values.size());
+ vec.resize(keys.size());
+
+ /* generate key/value pairs */
+ parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ vec[i] = KeyValue((Key)keys[i],values[i]);
+ });
+
+ /* perform parallel radix sort of the key/value pairs */
+ std::vector<KeyValue> temp(keys.size());
+ radix_sort<KeyValue,Key>(vec.data(),temp.data(),keys.size());
+ }
+
+ /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
+ __forceinline const Val* lookup(const Key& key) const
+ {
+ typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+ if (i == vec.end()) return nullptr;
+ if (i->key != key) return nullptr;
+ return &i->val;
+ }
+
+ /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
+ __forceinline Val lookup(const Key& key, const Val& def) const
+ {
+ typename std::vector<KeyValue>::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key);
+ if (i == vec.end()) return def;
+ if (i->key != key) return def;
+ return i->val;
+ }
+
+ /*! clears all state */
+ void clear() {
+ vec.clear();
+ }
+
+ private:
+ std::vector<KeyValue> vec; //!< vector containing sorted elements
+ };
+}
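
Usage note (not part of the patch): a sketch of building and querying the map, under the same assumptions as above; build_and_query is illustrative and presumes non-empty, equally sized key/value vectors.

```cpp
#include "parallel_map.h"
#include <cstdint>
#include <vector>

void build_and_query(const std::vector<uint32_t>& keys,
                     const std::vector<float>&    vals)
{
  embree::parallel_map<uint32_t, float> map(keys, vals); // parallel radix sort
  const float* p = map.lookup(keys[0]);    // nullptr if the key is absent
  float        v = map.lookup(42u, -1.0f); // default value if absent
  (void)p; (void)v;
}
```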
diff --git a/thirdparty/embree/common/algorithms/parallel_partition.h b/thirdparty/embree/common/algorithms/parallel_partition.h
new file mode 100644
index 0000000000..a1cbdc8e04
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+ /* serial partitioning */
+ template<typename T, typename V, typename IsLeft, typename Reduction_T>
+ __forceinline size_t serial_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ V& leftReduction,
+ V& rightReduction,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t)
+ {
+ T* l = array + begin;
+ T* r = array + end - 1;
+
+ while(1)
+ {
+ /* *l < pivot */
+ while (likely(l <= r && is_left(*l) ))
+ {
+ //prefetchw(l+4); // FIXME: enable?
+ reduction_t(leftReduction,*l);
+ ++l;
+ }
+ /* *r >= pivot) */
+ while (likely(l <= r && !is_left(*r)))
+ {
+ //prefetchw(r-4); // FIXME: enable?
+ reduction_t(rightReduction,*r);
+ --r;
+ }
+ if (r<l) break;
+
+ reduction_t(leftReduction ,*r);
+ reduction_t(rightReduction,*l);
+ xchg(*l,*r);
+ l++; r--;
+ }
+
+ return l - array;
+ }
+
+ template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+ class __aligned(64) parallel_partition_task
+ {
+ ALIGNED_CLASS_(64);
+ private:
+
+ static const size_t MAX_TASKS = 64;
+
+ T* array;
+ size_t N;
+ const IsLeft& is_left;
+ const Reduction_T& reduction_t;
+ const Reduction_V& reduction_v;
+ const Vi& identity;
+
+ size_t numTasks;
+ __aligned(64) size_t counter_start[MAX_TASKS+1];
+ __aligned(64) size_t counter_left[MAX_TASKS+1];
+ __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
+ __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
+ __aligned(64) V leftReductions[MAX_TASKS];
+ __aligned(64) V rightReductions[MAX_TASKS];
+
+ public:
+
+ __forceinline parallel_partition_task(T* array,
+ const size_t N,
+ const Vi& identity,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t,
+ const Reduction_V& reduction_v,
+ const size_t BLOCK_SIZE)
+
+ : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+ numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+
+ __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+ {
+ size_t i = 0;
+ while(index >= (size_t)r[i].size())
+ {
+ assert(i < numRanges);
+ index -= (size_t)r[i].size();
+ i++;
+ }
+ return &r[i];
+ }
+
+ __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+ const size_t numRightMisplacedRanges,
+ const size_t startID,
+ const size_t endID)
+ {
+ size_t leftLocalIndex = startID;
+ size_t rightLocalIndex = startID;
+ const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+ const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+
+ size_t l_left = l_range->size() - leftLocalIndex;
+ size_t r_left = r_range->size() - rightLocalIndex;
+ T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+ T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+ size_t size = endID - startID;
+ size_t items = min(size,min(l_left,r_left));
+
+ while (size)
+ {
+ if (unlikely(l_left == 0))
+ {
+ l_range++;
+ l_left = l_range->size();
+ l = &array[l_range->begin()];
+ items = min(size,min(l_left,r_left));
+ }
+
+ if (unlikely(r_left == 0))
+ {
+ r_range++;
+ r_left = r_range->size();
+ r = &array[r_range->begin()];
+ items = min(size,min(l_left,r_left));
+ }
+
+ size -= items;
+ l_left -= items;
+ r_left -= items;
+
+ while(items) {
+ items--;
+ xchg(*l++,*r++);
+ }
+ }
+ }
+
+ __forceinline size_t partition(V& leftReduction, V& rightReduction)
+ {
+ /* partition the individual ranges for each task */
+ parallel_for(numTasks,[&] (const size_t taskID) {
+ const size_t startID = (taskID+0)*N/numTasks;
+ const size_t endID = (taskID+1)*N/numTasks;
+ V local_left(identity);
+ V local_right(identity);
+ const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+ counter_start[taskID] = startID;
+ counter_left [taskID] = mid-startID;
+ leftReductions[taskID] = local_left;
+ rightReductions[taskID] = local_right;
+ });
+ counter_start[numTasks] = N;
+ counter_left[numTasks] = 0;
+
+ /* finalize the reductions */
+ for (size_t i=0; i<numTasks; i++) {
+ reduction_v(leftReduction,leftReductions[i]);
+ reduction_v(rightReduction,rightReductions[i]);
+ }
+
+ /* calculate mid point for partitioning */
+ size_t mid = counter_left[0];
+ for (size_t i=1; i<numTasks; i++)
+ mid += counter_left[i];
+ const range<ssize_t> globalLeft (0,mid);
+ const range<ssize_t> globalRight(mid,N);
+
+ /* calculate all left and right ranges that are on the wrong global side */
+ size_t numMisplacedRangesLeft = 0;
+ size_t numMisplacedRangesRight = 0;
+ size_t numMisplacedItemsLeft = 0;
+ size_t numMisplacedItemsRight = 0;
+
+ for (size_t i=0; i<numTasks; i++)
+ {
+ const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+ const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+ const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);
+ const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+ if (!left_misplaced.empty())
+ {
+ numMisplacedItemsLeft += left_misplaced.size();
+ leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+ }
+
+ if (!right_misplaced.empty())
+ {
+ numMisplacedItemsRight += right_misplaced.size();
+ rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+ }
+ }
+ assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+ /* if no items are misplaced we are done */
+ if (numMisplacedItemsLeft == 0)
+ return mid;
+
+ /* otherwise we copy the items to the right place in parallel */
+ parallel_for(numTasks,[&] (const size_t taskID) {
+ const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+ const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
+ swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
+ });
+
+ return mid;
+ }
+ };
+
+ template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+ __noinline size_t parallel_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ const Vi &identity,
+ V &leftReduction,
+ V &rightReduction,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t,
+ const Reduction_V& reduction_v,
+ size_t BLOCK_SIZE = 128)
+ {
+ /* fall back to single threaded partitioning for small N */
+ if (unlikely(end-begin < BLOCK_SIZE))
+ return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+ /* otherwise use parallel code */
+ else {
+ typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+ std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+ return begin+p->partition(leftReduction,rightReduction);
+ }
+ }
+
+ template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+ __noinline size_t parallel_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ const Vi &identity,
+ V &leftReduction,
+ V &rightReduction,
+ const IsLeft& is_left,
+ const Reduction_T& reduction_t,
+ const Reduction_V& reduction_v,
+ size_t BLOCK_SIZE,
+ size_t PARALLEL_THRESHOLD)
+ {
+ /* fall back to single threaded partitioning for small N */
+ if (unlikely(end-begin < PARALLEL_THRESHOLD))
+ return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+ /* otherwise use parallel code */
+ else {
+ typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+ std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+ return begin+p->partition(leftReduction,rightReduction);
+ }
+ }
+
+
+ template<typename T, typename IsLeft>
+ inline size_t parallel_partitioning(T* array,
+ const size_t begin,
+ const size_t end,
+ const IsLeft& is_left,
+ size_t BLOCK_SIZE = 128)
+ {
+ size_t leftReduction = 0;
+ size_t rightReduction = 0;
+ return parallel_partitioning(
+ array,begin,end,0,leftReduction,rightReduction,is_left,
+ [] (size_t& t,const T& ref) { },
+ [] (size_t& t0,size_t& t1) { },
+ BLOCK_SIZE);
+ }
+
+}
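
Usage note (not part of the patch): a sketch of the reduction-free convenience overload, under the same assumptions as above; split_negatives is an illustrative name.

```cpp
#include "parallel_partition.h"
#include <vector>

// Moves all negative values in front of the non-negative ones and returns
// the split position; inputs smaller than BLOCK_SIZE (default 128) fall
// back to serial_partitioning.
size_t split_negatives(std::vector<int>& v)
{
  return embree::parallel_partitioning(v.data(), size_t(0), v.size(),
    [](const int& x) { return x < 0; });
}
```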
diff --git a/thirdparty/embree/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
new file mode 100644
index 0000000000..208bb4e480
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename Value>
+ struct ParallelPrefixSumState
+ {
+ enum { MAX_TASKS = 64 };
+ Value counts[MAX_TASKS];
+ Value sums [MAX_TASKS];
+ };
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+ {
+ /* calculate number of tasks to use */
+ const size_t numThreads = TaskScheduler::threadCount();
+ const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
+ const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+ /* perform parallel prefix sum */
+ parallel_for(taskCount, [&](const size_t taskIndex)
+ {
+ const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
+ const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
+ state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
+ });
+
+ /* calculate prefix sum */
+ Value sum=identity;
+ for (size_t i=0; i<taskCount; i++)
+ {
+ const Value c = state.counts[i];
+ state.sums[i] = sum;
+ sum=reduction(sum,c);
+ }
+
+ return sum;
+ }
+
+ /*! parallel calculation of prefix sums */
+ template<typename SrcArray, typename DstArray, typename Value, typename Add>
+ __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
+ {
+ /* perform single threaded prefix operation for small N */
+ if (N < SINGLE_THREAD_THRESHOLD)
+ {
+ Value sum=identity;
+ for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
+ return sum;
+ }
+
+ /* perform parallel prefix operation for large N */
+ else
+ {
+ ParallelPrefixSumState<Value> state;
+
+ /* initial run just sets up start values for subtasks */
+ parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+
+ Value s = identity;
+ for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
+ return s;
+
+ }, add);
+
+ /* final run calculates prefix sum */
+ return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
+
+ Value s = identity;
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ dst[i] = add(sum,s);
+ s = add(s,src[i]);
+ }
+ return s;
+
+ }, add);
+ }
+ }
+}
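
Usage note (not part of the patch): a sketch of the array-based overload, under the same assumptions as above; exclusive_scan is an illustrative name.

```cpp
#include "parallel_prefix_sum.h"
#include <vector>

// Exclusive scan: dst[i] becomes src[0] + ... + src[i-1]; the return value
// is the total. Inputs below 4096 elements run single threaded.
size_t exclusive_scan(const std::vector<size_t>& src, std::vector<size_t>& dst)
{
  dst.resize(src.size());
  return embree::parallel_prefix_sum(src, dst, src.size(), size_t(0),
    [](size_t a, size_t b) { return a + b; });
}
```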
diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h
new file mode 100644
index 0000000000..8271372ea4
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_reduce.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return func(range<Index>(first,last));
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return func(range<Index>(first,last));
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ const Index maxTasks = 512;
+ const Index threadCount = (Index) TaskScheduler::threadCount();
+ taskCount = min(taskCount,threadCount,maxTasks);
+
+ /* parallel invocation of all tasks */
+ dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
+ parallel_for(taskCount, [&](const Index taskIndex) {
+ const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
+ const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
+ values[taskIndex] = func(range<Index>(k0,k1));
+ });
+
+ /* perform reduction over all tasks */
+ Value v = identity;
+ for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
+ return v;
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+#if defined(TASKING_INTERNAL)
+
+ /* fast path for small number of iterations */
+ Index taskCount = (last-first+minStepSize-1)/minStepSize;
+ if (likely(taskCount == 1)) {
+ return func(range<Index>(first,last));
+ }
+ return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
+
+#elif defined(TASKING_TBB)
+ #if TBB_INTERFACE_VERSION >= 12002
+ tbb::task_group_context context;
+ const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+ [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+ reduction,context);
+ // -- GODOT start --
+ // if (context.is_group_execution_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
+ return v;
+ #else
+ const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+ [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+ reduction);
+ // -- GODOT start --
+ // if (tbb::task::self().is_cancelled())
+ // throw std::runtime_error("task cancelled");
+ // -- GODOT end --
+ return v;
+ #endif
+#else // TASKING_PPL
+ struct AlignedValue
+ {
+ char storage[__alignof(Value)+sizeof(Value)];
+ static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
+ Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+ const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+ AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
+ AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
+ AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
+ AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+ AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+ operator Value() const { return *getValuePtr(); }
+ };
+
+ struct Iterator_Index
+ {
+ Index v;
+ typedef std::forward_iterator_tag iterator_category;
+ typedef AlignedValue value_type;
+ typedef Index difference_type;
+ typedef Index distance_type;
+ typedef AlignedValue* pointer;
+ typedef AlignedValue& reference;
+ __forceinline Iterator_Index() {}
+ __forceinline Iterator_Index(Index v) : v(v) {}
+ __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
+ __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
+ __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
+ __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
+ };
+
+ auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
+ assert(begin.v < end.v);
+ return reduction(start, func(range<Index>(begin.v, end.v)));
+ };
+ const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
+ return v;
+#endif
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ if (likely(last-first < parallel_threshold)) {
+ return func(range<Index>(first,last));
+ } else {
+ return parallel_reduce(first,last,minStepSize,identity,func,reduction);
+ }
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ auto funcr = [&] ( const range<Index> r ) {
+ Value v = identity;
+ for (Index i=r.begin(); i<r.end(); i++)
+ v = reduction(v,func(i));
+ return v;
+ };
+ return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
+ }
+
+ template<typename Index, typename Value, typename Func, typename Reduction>
+ __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
+ {
+ return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
+ }
+}
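
Usage note (not part of the patch): a sketch of the ranged reduction, under the same assumptions as above; sum is an illustrative name.

```cpp
#include "parallel_reduce.h"
#include <vector>

float sum(const std::vector<float>& v)
{
  // func reduces one sub-range; reduction combines the per-task results.
  return embree::parallel_reduce(size_t(0), v.size(), size_t(1024), 0.0f,
    [&](const embree::range<size_t>& r) {
      float s = 0.0f;
      for (size_t i = r.begin(); i < r.end(); i++) s += v[i];
      return s;
    },
    [](float a, float b) { return a + b; });
}
```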
diff --git a/thirdparty/embree/common/algorithms/parallel_set.h b/thirdparty/embree/common/algorithms/parallel_set.h
new file mode 100644
index 0000000000..7eae577457
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_set.h
@@ -0,0 +1,52 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+ /* implementation of a set of values with parallel construction */
+ template<typename T>
+ class parallel_set
+ {
+ public:
+
+ /*! default constructor for the parallel set */
+ parallel_set () {}
+
+ /*! construction from vector */
+ template<typename Vector>
+ parallel_set (const Vector& in) { init(in); }
+
+ /*! initializes the parallel set from a vector */
+ template<typename Vector>
+ void init(const Vector& in)
+ {
+ /* copy data to internal vector */
+ vec.resize(in.size());
+ parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ vec[i] = in[i];
+ });
+
+ /* sort the data */
+ std::vector<T> temp(in.size());
+ radix_sort<T>(vec.data(),temp.data(),vec.size());
+ }
+
+ /*! tests if some element is in the set */
+ __forceinline bool lookup(const T& elt) const {
+ return std::binary_search(vec.begin(), vec.end(), elt);
+ }
+
+ /*! clears all state */
+ void clear() {
+ vec.clear();
+ }
+
+ private:
+ std::vector<T> vec; //!< vector containing sorted elements
+ };
+}
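
Usage note (not part of the patch): a sketch of building and querying the set, under the same assumptions as above; is_member is an illustrative name.

```cpp
#include "parallel_set.h"
#include <cstdint>
#include <vector>

bool is_member(const std::vector<uint32_t>& items, uint32_t x)
{
  embree::parallel_set<uint32_t> set(items); // parallel copy + radix sort
  return set.lookup(x);                      // binary search on sorted data
}
```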
diff --git a/thirdparty/embree/common/algorithms/parallel_sort.h b/thirdparty/embree/common/algorithms/parallel_sort.h
new file mode 100644
index 0000000000..30e56c2bfc
--- /dev/null
+++ b/thirdparty/embree/common/algorithms/parallel_sort.h
@@ -0,0 +1,454 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../simd/simd.h"
+#include "parallel_for.h"
+#include <algorithm>
+
+namespace embree
+{
+ template<class T>
+ __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
+ {
+ for(size_t i = 1;i<length;++i)
+ {
+ T v = array[i];
+ size_t j = i;
+ while(j > 0 && v < array[j-1])
+ {
+ array[j] = array[j-1];
+ --j;
+ }
+ array[j] = v;
+ }
+ }
+
+ template<class T>
+ __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
+ {
+ for(size_t i = 1;i<length;++i)
+ {
+ T v = array[i];
+ size_t j = i;
+ while(j > 0 && v > array[j-1])
+ {
+ array[j] = array[j-1];
+ --j;
+ }
+ array[j] = v;
+ }
+ }
+
+ template<class T>
+ void quicksort_ascending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] > pivotvalue);
+ while (t[++left] < pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const int pivot = right;
+ quicksort_ascending(t, begin, pivot);
+ quicksort_ascending(t, pivot + 1, end);
+ }
+ }
+
+ template<class T>
+ void quicksort_decending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] < pivotvalue);
+ while (t[++left] > pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const int pivot = right;
+ quicksort_decending(t, begin, pivot);
+ quicksort_decending(t, pivot + 1, end);
+ }
+ }
+
+
+ template<class T, ssize_t THRESHOLD>
+ void quicksort_insertionsort_ascending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const ssize_t size = end-begin+1;
+ if (likely(size <= THRESHOLD))
+ {
+ insertionsort_ascending<T>(&t[begin],size);
+ }
+ else
+ {
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] > pivotvalue);
+ while (t[++left] < pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const ssize_t pivot = right;
+ quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
+ quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
+ }
+ }
+ }
+
+
+ template<class T, ssize_t THRESHOLD>
+ void quicksort_insertionsort_decending(T *__restrict__ t,
+ const ssize_t begin,
+ const ssize_t end)
+ {
+ if (likely(begin < end))
+ {
+ const ssize_t size = end-begin+1;
+ if (likely(size <= THRESHOLD))
+ {
+ insertionsort_decending<T>(&t[begin],size);
+ }
+ else
+ {
+
+ const T pivotvalue = t[begin];
+ ssize_t left = begin - 1;
+ ssize_t right = end + 1;
+
+ while(1)
+ {
+ while (t[--right] < pivotvalue);
+ while (t[++left] > pivotvalue);
+
+ if (left >= right) break;
+
+ const T temp = t[right];
+ t[right] = t[left];
+ t[left] = temp;
+ }
+
+ const ssize_t pivot = right;
+ quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
+ quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
+ }
+ }
+ }
+
+ template<typename T>
+ static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
+ {
+ static const unsigned int BITS = 8;
+ static const unsigned int BUCKETS = (1 << BITS);
+ static const unsigned int CMP_SORT_THRESHOLD = 16;
+
+ __aligned(64) unsigned int count[BUCKETS];
+
+ /* clear buckets */
+ for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
+
+ /* count buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+ for (size_t i=0;i<num;i++)
+ count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
+
+ /* prefix sums */
+ __aligned(64) unsigned int head[BUCKETS];
+ __aligned(64) unsigned int tail[BUCKETS];
+
+ head[0] = 0;
+ for (size_t i=1; i<BUCKETS; i++)
+ head[i] = head[i-1] + count[i-1];
+
+ for (size_t i=0; i<BUCKETS-1; i++)
+ tail[i] = head[i+1];
+
+ tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
+
+ assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
+ assert(tail[BUCKETS-1] == num);
+
+ /* in-place swap */
+ for (size_t i=0;i<BUCKETS;i++)
+ {
+ /* process bucket */
+ while(head[i] < tail[i])
+ {
+ T v = morton[head[i]];
+ while(1)
+ {
+ const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
+ if (b == i) break;
+ std::swap(v,morton[head[b]++]);
+ }
+ assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
+ morton[head[i]++] = v;
+ }
+ }
+ if (shift == 0) return;
+
+ size_t offset = 0;
+ for (size_t i=0;i<BUCKETS;i++)
+ if (count[i])
+ {
+
+ for (size_t j=offset;j<offset+count[i]-1;j++)
+ assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
+
+ if (unlikely(count[i] < CMP_SORT_THRESHOLD))
+ insertionsort_ascending(morton + offset, count[i]);
+ else
+ radixsort32(morton + offset, count[i], shift-BITS);
+
+ for (size_t j=offset;j<offset+count[i]-1;j++)
+ assert(morton[j] <= morton[j+1]);
+
+ offset += count[i];
+ }
+ }
+
+ template<typename Ty, typename Key>
+ class ParallelRadixSort
+ {
+ static const size_t MAX_TASKS = 64;
+ static const size_t BITS = 8;
+ static const size_t BUCKETS = (1 << BITS);
+ typedef unsigned int TyRadixCount[BUCKETS];
+
+ template<typename T>
+ static bool compare(const T& v0, const T& v1) {
+ return (Key)v0 < (Key)v1;
+ }
+
+ private:
+ ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
+ ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
+
+
+ public:
+ ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
+ : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
+
+ void sort(const size_t blockSize)
+ {
+ assert(blockSize > 0);
+
+ /* perform single threaded sort for small N */
+ if (N<=blockSize) // also handles the special case of N == 0
+ {
+ /* do in-place sort inside destination array */
+ std::sort(src,src+N,compare<Ty>);
+ }
+
+ /* perform parallel sort for large N */
+ else
+ {
+ const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
+ tbbRadixSort(numThreads);
+ }
+ }
+
+ ~ParallelRadixSort()
+ {
+ alignedFree(radixCount);
+ radixCount = nullptr;
+ }
+
+ private:
+
+ void tbbRadixIteration0(const Key shift,
+ const Ty* __restrict const src,
+ Ty* __restrict const dst,
+ const size_t threadIndex, const size_t threadCount)
+ {
+ const size_t startID = (threadIndex+0)*N/threadCount;
+ const size_t endID = (threadIndex+1)*N/threadCount;
+
+ /* mask to extract some number of bits */
+ const Key mask = BUCKETS-1;
+
+ /* count how many items go into the buckets */
+ for (size_t i=0; i<BUCKETS; i++)
+ radixCount[threadIndex][i] = 0;
+
+ /* iterate over src array and count buckets */
+ unsigned int * __restrict const count = radixCount[threadIndex];
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+ for (size_t i=startID; i<endID; i++) {
+#if defined(__64BIT__)
+ const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+ const Key index = ((Key)src[i] >> shift) & mask;
+#endif
+ count[index]++;
+ }
+ }
+
+ void tbbRadixIteration1(const Key shift,
+ const Ty* __restrict const src,
+ Ty* __restrict const dst,
+ const size_t threadIndex, const size_t threadCount)
+ {
+ const size_t startID = (threadIndex+0)*N/threadCount;
+ const size_t endID = (threadIndex+1)*N/threadCount;
+
+ /* mask to extract some number of bits */
+ const Key mask = BUCKETS-1;
+
+ /* calculate total number of items for each bucket */
+ __aligned(64) unsigned int total[BUCKETS];
+ /*
+ for (size_t i=0; i<BUCKETS; i++)
+ total[i] = 0;
+ */
+ for (size_t i=0; i<BUCKETS; i+=VSIZEX)
+ vintx::store(&total[i], zero);
+
+ for (size_t i=0; i<threadCount; i++)
+ {
+ /*
+ for (size_t j=0; j<BUCKETS; j++)
+ total[j] += radixCount[i][j];
+ */
+ for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+ vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
+ }
+
+ /* calculate start offset of each bucket */
+ __aligned(64) unsigned int offset[BUCKETS];
+ offset[0] = 0;
+ for (size_t i=1; i<BUCKETS; i++)
+ offset[i] = offset[i-1] + total[i-1];
+
+ /* calculate start offset of each bucket for this thread */
+ for (size_t i=0; i<threadIndex; i++)
+ {
+ /*
+ for (size_t j=0; j<BUCKETS; j++)
+ offset[j] += radixCount[i][j];
+ */
+ for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+ vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
+ }
+
+ /* copy items into their buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+ for (size_t i=startID; i<endID; i++) {
+ const Ty elt = src[i];
+#if defined(__64BIT__)
+ const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+ const size_t index = ((Key)src[i] >> shift) & mask;
+#endif
+ dst[offset[index]++] = elt;
+ }
+ }
+
+ void tbbRadixIteration(const Key shift, const bool last,
+ const Ty* __restrict src, Ty* __restrict dst,
+ const size_t numTasks)
+ {
+ affinity_partitioner ap;
+ parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
+ parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
+ }
+
+ void tbbRadixSort(const size_t numTasks)
+ {
+ radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
+
+ if (sizeof(Key) == sizeof(uint32_t)) {
+ tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
+ }
+ else if (sizeof(Key) == sizeof(uint64_t))
+ {
+ tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
+ tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
+ tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
+ }
+ }
+
+ private:
+ TyRadixCount* radixCount;
+ Ty* const src;
+ Ty* const tmp;
+ const size_t N;
+ };
+
+ template<typename Ty>
+ void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+ {
+ ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
+ }
+
+ template<typename Ty, typename Key>
+ void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
+ {
+ ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
+ }
+
+ template<typename Ty>
+ void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+ radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
+ }
+
+ template<typename Ty>
+ void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+ radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
+ }
+}
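
Usage note (not part of the patch): a sketch of the 32-bit key entry point, under the same assumptions as above; sort_keys is an illustrative name.

```cpp
#include "parallel_sort.h"
#include <cstdint>
#include <vector>

void sort_keys(std::vector<uint32_t>& v)
{
  std::vector<uint32_t> tmp(v.size()); // scratch buffer of the same size
  embree::radix_sort_u32(v.data(), tmp.data(), v.size());
  // v is now sorted ascending; small inputs (N <= blockSize, default 8192)
  // are handled by std::sort in place.
}
```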