1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
#[compute]
#version 450
#VERSION_DEFINES
// Workgroup size: 8x8 invocations in X/Y and a single layer in Z.
// main() uses gl_GlobalInvocationID.xy as the position of the cluster it processes.
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
// Push constants describing how the cluster buffers are laid out.
// main() uses every size/offset below as an index into a uint data[] array,
// so they are counted in uints, not bytes.
layout(push_constant, std430) uniform Params {
uint cluster_render_data_size; // stride of one cluster's record in cluster_render.data
uint max_render_element_count_div_32; // offset from the start of a cluster's record to its per-element Z-range words
uvec2 cluster_screen_size; // cluster grid dimensions; invocations at or beyond them return immediately
uint render_element_count_div_32; // number of 32-bit element-mask words scanned per cluster
uint max_cluster_element_count_div_32; // element-mask words per (type, cluster) record in cluster_store.data; 32 Z-slice words follow them
uint pad1; // not read by main(); presumably padding - TODO confirm against the host-side struct
uint pad2; // not read by main(); presumably padding - TODO confirm against the host-side struct
}
params;
// Read-only input. For the cluster whose record starts at
// src_offset = cluster_index * params.cluster_render_data_size, main() reads:
//   data[src_offset + w], w < params.render_element_count_div_32:
//       element mask; bit b set selects render element w * 32 + b for processing;
//   data[src_offset + params.max_render_element_count_div_32 + index]:
//       Z-range bitmask of render element `index` (lowest/highest set bit bound its slices).
layout(set = 0, binding = 1, std430) buffer restrict readonly ClusterRender {
uint data[];
}
cluster_render;
// Output written by main(). One record of (params.max_cluster_element_count_div_32 + 32)
// uints per (element type, cluster) pair, located at
//   (cluster_index + type * cluster_count) * record_size.
// Within a record:
//   [0, max_cluster_element_count_div_32): bitmask with bit `original_index` set per stored element;
//   the following 32 words, one per Z slice: low 16 bits = minimum original_index,
//       high 16 bits = maximum original_index + 1 (a zero word is treated as "not written yet").
layout(set = 0, binding = 2, std430) buffer restrict ClusterStore {
uint data[];
}
cluster_store;
// One entry of the RenderElements buffer. main() reads only type, touches_near,
// touches_far and original_index.
struct RenderElement {
uint type; // 0-4; selects which per-type section of cluster_store receives the element
bool touches_near; // when true, the element's Z range starts at slice 0
bool touches_far; // when true, the element's Z range ends at slice 32 (exclusive)
uint original_index; // index recorded in cluster_store (bit position and min/max value)
mat3x4 transform_inv; // not read by main()
vec3 scale; // not read by main()
uint pad; // not read by main(); presumably padding - TODO confirm against the host-side struct
};
// Read-only array of render elements. main() indexes it with the element index
// decoded from the cluster_render masks (mask word * 32 + bit position).
layout(set = 0, binding = 3, std430) buffer restrict readonly RenderElements {
RenderElement data[];
}
render_elements;
// Entry point: each invocation handles the cluster at gl_GlobalInvocationID.xy.
//
// Scans this cluster's element-mask words in cluster_render. For every set bit
// whose Z-range word is non-zero it
//   - widens the packed [min, max + 1) original-index range of each covered Z slice, and
//   - sets the element's bit in the (type, cluster) bitmask,
// both inside cluster_store.
void main() {
	uvec2 cluster_pos = gl_GlobalInvocationID.xy;
	if (any(greaterThanEqual(cluster_pos, params.cluster_screen_size))) {
		// Outside the cluster grid: nothing to do.
		return;
	}

	// Row-major index of this cluster and the start of its input record.
	uint cluster_index = cluster_pos.x + params.cluster_screen_size.x * cluster_pos.y;
	uint src_base = cluster_index * params.cluster_render_data_size;

	// Loop invariants: clusters per type section, and the size of one output
	// record (element-mask words followed by 32 Z-slice words).
	uint clusters_per_type = params.cluster_screen_size.x * params.cluster_screen_size.y;
	uint record_stride = params.max_cluster_element_count_div_32 + 32u;

	for (uint word = 0u; word < params.render_element_count_div_32; ++word) {
		// `pending &= pending - 1u` clears the lowest set bit, i.e. the one just handled.
		for (uint pending = cluster_render.data[src_base + word]; pending != 0u; pending &= pending - 1u) {
			uint element = word * 32u + uint(findLSB(pending));

			// Z slices touched by the element; expected to be non-zero whenever
			// its mask bit is set, but checked anyway.
			uint z_mask = cluster_render.data[src_base + params.max_render_element_count_div_32 + element];
			if (z_mask == 0u) {
				continue;
			}

			// Half-open slice range [z_begin, z_end), stretched to the first/last
			// slice when the element touches the near/far side.
			uint z_begin = render_elements.data[element].touches_near ? 0u : uint(findLSB(z_mask));
			uint z_end = render_elements.data[element].touches_far ? 32u : uint(findMSB(z_mask)) + 1u;

			uint element_type = render_elements.data[element].type;
			uint orig = render_elements.data[element].original_index;

			// Start of the output record for (type, cluster) and of its Z-slice words.
			uint dst_base = (cluster_index + element_type * clusters_per_type) * record_stride;
			uint slices_base = dst_base + params.max_cluster_element_count_div_32;

			for (uint z = z_begin; z < z_end; ++z) {
				// Packed range: low 16 bits = min index, high 16 bits = max index + 1.
				// A zero word means "never written", so seed it with
				// min = 0xFFFF and max + 1 = 0 before merging.
				uint range_word = cluster_store.data[slices_base + z];
				if (range_word == 0u) {
					range_word = 0xFFFFu;
				}
				uint lo = min(orig, range_word & 0xFFFFu);
				uint hi = max(orig + 1u, range_word >> 16);
				cluster_store.data[slices_base + z] = lo | (hi << 16);
			}

			// Flag the element in the record's bitmask: word orig / 32, bit orig % 32.
			cluster_store.data[dst_base + (orig >> 5)] |= 1u << (orig & 0x1Fu);
		}
	}
}
|