summaryrefslogtreecommitdiff
path: root/thirdparty/embree/common/algorithms/parallel_for_for.h
blob: 7838ef11b38cede2f9698cd51fb967c1dd8f38b9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"

namespace embree
{
  /*! Sequentially visits every non-empty inner array of an array of arrays.
   *
   *  For each inner array with N > 0 elements this invokes
   *  func(array2[i], range<size_t>(0,N), k), where k is the total number
   *  of elements of all preceding inner arrays. Null entries of array2
   *  are treated as empty arrays, matching parallel_for_for.
   *
   *  \param array2       outer container; must provide size() and an
   *                      operator[] yielding pointer-like objects that
   *                      provide size()
   *  \param minStepSize  not used by the sequential version; present so
   *                      the signature mirrors parallel_for_for
   *  \param func         callback invoked once per non-empty inner array
   */
  template<typename ArrayArray, typename Func>
    __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) 
  {
    size_t k=0;
    for (size_t i=0; i!=array2.size(); ++i) {
      /* a null inner array contributes no elements instead of being dereferenced */
      const size_t N = array2[i] ? array2[i]->size() : 0;
      if (N) func(array2[i],range<size_t>(0,N),k);
      k+=N;
    }
  }

  /*! Precomputed work partitioning for iterating over an array of arrays.
   *
   *  The elements of all inner arrays are viewed as one global sequence of
   *  N elements. init() chooses a task count (at most MAX_TASKS) and
   *  records, for every task t, the inner array index i0[t] and the
   *  element index j0[t] inside that array at which global element
   *  t*N/taskCount is located. */
  class ParallelForForState
  {
  public:

    /*! upper bound on the number of tasks; also the capacity of i0 and j0 */
    enum { MAX_TASKS = 64 };

    /*! creates an empty state with no tasks and no elements */
    __forceinline ParallelForForState () 
      : taskCount(0), N(0) {}

    /*! initializes the state from an array of arrays, see init() */
    template<typename ArrayArray>
      __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
      init(array2,minStepSize);
    }

    /*! initializes the state from a size callback, see init() */
    template<typename SizeFunc>
    __forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) {
      init(numArrays,getSize,minStepSize);
    } 

    /*! Computes the task count and the start position of each task.
     *
     *  \param numArrays    number of inner arrays
     *  \param getSize      callable returning the element count of inner array i
     *  \param minStepSize  desired minimum number of elements per task;
     *                      a value of 0 is treated as 1
     */
    template<typename SizeFunc>
    __forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize )
    {
      /* first calculate total number of elements */
      size_t N = 0;
      for (size_t i=0; i<numArrays; i++) {
        N += getSize(i);
      }
      this->N = N;

      /* calculate number of tasks to use (clamp the step size to avoid a division by zero) */
      const size_t numThreads = TaskScheduler::threadCount();
      const size_t stepSize   = max(minStepSize,size_t(1));
      const size_t numBlocks  = (N+stepSize-1)/stepSize;
      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
      
      /* calculate start (i,j) for each task */
      size_t taskIndex = 0;
      i0[taskIndex] = 0;
      j0[taskIndex] = 0;
      size_t k0 = (++taskIndex)*N/taskCount; // global start index of the next task to place
      for (size_t i=0, k=0; taskIndex < taskCount; i++) 
      {
        /* k is the global index of element j of inner array i */
        assert(i<numArrays);
        size_t j=0, M = getSize(i);
        /* place every task whose start index k0 lies within the rest of array i */
        while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
          assert(taskIndex<taskCount);
          i0[taskIndex] = i;
          j0[taskIndex] = j += k0-k;
          k=k0;
          k0 = (++taskIndex)*N/taskCount;
        }
        /* skip the remaining elements of array i */
        k+=M-j;
      }
    }

    /*! initializes from an array of arrays; null inner arrays count as empty */
    template<typename ArrayArray>
      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
    {
      init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize);
    }
    
    /*! returns the total number of elements over all inner arrays */
    __forceinline size_t size() const {
      return N;
    }
    
  public:
    size_t i0[MAX_TASKS];  //!< inner array index at which each task starts
    size_t j0[MAX_TASKS];  //!< element index inside that inner array at which each task starts
    size_t taskCount;      //!< number of tasks in use (0 for a default-constructed state)
    size_t N;              //!< total number of elements
  };

  /*! Applies func in parallel to all elements of an array of arrays.
   *
   *  The work is split into state.taskCount tasks. Each task walks the
   *  inner arrays starting from its precomputed position and calls
   *  func(array2[i], range<size_t>(r0,r1), k) for each non-empty piece,
   *  where k is the global index of the first element of that piece.
   *  Null inner arrays are treated as empty.
   *
   *  \param array2       outer container of pointer-like inner arrays
   *  \param minStepSize  forwarded to ParallelForForState
   *  \param func         callback invoked per piece of an inner array
   */
  template<typename ArrayArray, typename Func>
    __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
  {
    ParallelForForState state(array2,minStepSize);

    parallel_for(state.taskCount, [&](const size_t taskIndex)
    {
      /* global element interval [begin,end) handled by this task */
      const size_t begin = (taskIndex+0)*state.size()/state.taskCount;
      const size_t end   = (taskIndex+1)*state.size()/state.taskCount;

      /* precomputed start position of this task */
      size_t arrayIndex = state.i0[taskIndex];
      size_t first      = state.j0[taskIndex];

      /* walk the inner arrays until the interval is consumed */
      size_t globalIndex = begin;
      while (globalIndex < end)
      {
        const size_t count = array2[arrayIndex] ? array2[arrayIndex]->size() : 0;
        const size_t last  = min(count,first+end-globalIndex);
        if (last > first)
          func(array2[arrayIndex],range<size_t>(first,last),globalIndex);

        globalIndex += last-first;
        first = 0; // every following inner array is entered at its first element
        ++arrayIndex;
      }
    });
  }

  /*! Convenience overload of parallel_for_for that uses a minimum step size of 1. */
  template<typename ArrayArray, typename Func>
    __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
  {
    const size_t defaultMinStepSize = 1;
    parallel_for_for(array2,defaultMinStepSize,func);
  }

  /*! Parallel reduction over all elements of an array of arrays.
   *
   *  Each task starts from identity and folds the values returned by
   *  func(array2[i], range<size_t>(r0,r1), k) for its pieces of the inner
   *  arrays using reduction. After all tasks have finished, the per-task
   *  results are folded in task order, again starting from identity.
   *  Null inner arrays are treated as empty.
   *
   *  \param array2       outer container of pointer-like inner arrays
   *  \param minStepSize  forwarded to ParallelForForState
   *  \param identity     initial value of every accumulator
   *  \param func         callback returning a Value for a piece of an inner array
   *  \param reduction    binary operation combining two Values
   *  \return the combined value over all tasks
   */
  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
  {
    ParallelForForState state(array2,minStepSize);

    /* one accumulator per task, each starting at the identity */
    Value partial[ParallelForForState::MAX_TASKS];
    for (size_t t=0; t!=state.taskCount; ++t)
      partial[t] = identity;

    parallel_for(state.taskCount, [&](const size_t taskIndex)
    {
      /* global element interval [begin,end) handled by this task */
      const size_t begin = (taskIndex+0)*state.size()/state.taskCount;
      const size_t end   = (taskIndex+1)*state.size()/state.taskCount;

      /* precomputed start position of this task */
      size_t arrayIndex = state.i0[taskIndex];
      size_t first      = state.j0[taskIndex];

      /* walk the inner arrays until the interval is consumed */
      size_t globalIndex = begin;
      while (globalIndex < end)
      {
        const size_t count = array2[arrayIndex] ? array2[arrayIndex]->size() : 0;
        const size_t last  = min(count,first+end-globalIndex);
        if (last > first)
          partial[taskIndex] = reduction(partial[taskIndex],func(array2[arrayIndex],range<size_t>(first,last),globalIndex));

        globalIndex += last-first;
        first = 0; // every following inner array is entered at its first element
        ++arrayIndex;
      }
    });

    /* combine the per-task results in task order */
    Value result = identity;
    for (size_t t=0; t!=state.taskCount; ++t)
      result = reduction(result,partial[t]);
    return result;
  }

  /*! Convenience overload of parallel_for_for_reduce that uses a minimum step size of 1. */
  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
  {
    const size_t defaultMinStepSize = 1;
    return parallel_for_for_reduce(array2,defaultMinStepSize,identity,func,reduction);
  }
}