1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Value>
struct ParallelPrefixSumState
{
enum { MAX_TASKS = 64 };
Value counts[MAX_TASKS];
Value sums [MAX_TASKS];
};
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
{
/* calculate number of tasks to use */
const size_t numThreads = TaskScheduler::threadCount();
const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
/* perform parallel prefix sum */
parallel_for(taskCount, [&](const size_t taskIndex)
{
const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
});
/* calculate prefix sum */
Value sum=identity;
for (size_t i=0; i<taskCount; i++)
{
const Value c = state.counts[i];
state.sums[i] = sum;
sum=reduction(sum,c);
}
return sum;
}
/*! parallel calculation of prefix sums */
template<typename SrcArray, typename DstArray, typename Value, typename Add>
__forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
{
/* perform single threaded prefix operation for small N */
if (N < SINGLE_THREAD_THRESHOLD)
{
Value sum=identity;
for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
return sum;
}
/* perform parallel prefix operation for large N */
else
{
ParallelPrefixSumState<Value> state;
/* initial run just sets up start values for subtasks */
parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
Value s = identity;
for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
return s;
}, add);
/* final run calculates prefix sum */
return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
Value s = identity;
for (size_t i=r.begin(); i<r.end(); i++) {
dst[i] = add(sum,s);
s = add(s,src[i]);
}
return s;
}, add);
}
}
}
|