summaryrefslogtreecommitdiff
path: root/servers/rendering/renderer_rd/shaders/shadow_reduce.glsl
blob: a29b24e560cc9f5a86d87dfb192d4673c69597e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#[compute]

#version 450

VERSION_DEFINES

// Edge length, in invocations, of the square workgroup used by the
// non-subgroup variants. It also sizes the shared tile and the lookup tables
// declared for MODE_REDUCE.
#define BLOCK_SIZE 8

#ifdef MODE_REDUCE_SUBGROUP

// main() calls subgroupBroadcastFirst() and subgroupAdd() in this variant.
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable

// 8 x 4 = 32 invocations per workgroup; the original note calls this
// "nvidia friendly, max 32".
layout(local_size_x = 8, local_size_y = 4, local_size_z = 1) in;

#else

// BLOCK_SIZE x BLOCK_SIZE invocations per workgroup.
layout(local_size_x = BLOCK_SIZE, local_size_y = BLOCK_SIZE, local_size_z = 1) in;

#endif

#ifdef MODE_REDUCE

// Workgroup-shared scratch tile, one float per invocation; main() reduces it
// in place.
shared float tmp_data[BLOCK_SIZE * BLOCK_SIZE];
// 3-bit bit-reversal permutation. main() stores each texel at bit-reversed
// x/y coordinates, so entries that are `size` apart in the tile hold values
// from neighbouring image regions at every halving step.
const uint swizzle_table[BLOCK_SIZE] = uint[](0, 4, 2, 6, 1, 5, 3, 7);
// Maps a tile coordinate back to its output coordinate once the tile has been
// reduced to `size` x `size`. It is indexed with (size + pos): entry 1 serves
// size 1, entries 2-3 serve size 2 and entries 4-7 serve size 4. Entry 0 is
// never read by main().
const uint unswizzle_table[BLOCK_SIZE] = uint[](0, 0, 0, 1, 0, 2, 1, 3);

#endif

// Input: the reduce variants read the source through a sampler (texelFetch in
// main()); every other variant reads it as an r16 image (imageLoad).
#if defined(MODE_REDUCE) || defined(MODE_REDUCE_SUBGROUP)
layout(set = 0, binding = 0) uniform sampler2D source_depth;
#else
layout(r16, set = 0, binding = 0) uniform restrict readonly image2D source_depth;
#endif
// Output: write-only r16 image, bound in descriptor set 1.
layout(r16, set = 1, binding = 0) uniform restrict writeonly image2D dst_depth;

// Push constants shared by all variants; each variant reads only the fields
// it needs.
layout(push_constant, binding = 1, std430) uniform Params {
	// Exclusive upper bound for source coordinates: fetches are clamped to
	// source_size - 1 and the filter pass skips invocations at or beyond it.
	ivec2 source_size;
	// Added to the invocation ID to obtain the texel position.
	ivec2 source_offset;
	// Reduce variants: tile edge length at which the reduction stops.
	uint min_size;
	// Filter variant: added to the tap index when reading the kernel table,
	// so it acts as the start offset of the selected kernel.
	uint gaussian_kernel_version;
	// Filter variant: texel step between consecutive taps.
	ivec2 filter_dir;
}
params;

// Shader entry point. Each MODE_* section below is a separate variant of the
// pass; they declare the same local names (image_pos, image_offset), so only
// one of them can be compiled in at a time.
//
//  - MODE_REDUCE_SUBGROUP: averages depth using subgroup arithmetic.
//  - MODE_REDUCE:          averages depth through a shared-memory tile.
//  - MODE_FILTER:          one-dimensional 9-tap gaussian blur along filter_dir.
void main() {
#ifdef MODE_REDUCE_SUBGROUP

	uvec2 local_pos = gl_LocalInvocationID.xy;
	ivec2 image_offset = params.source_offset;
	// Every invocation covers two vertically adjacent texels, so the 8 x 4
	// workgroup spans an 8 x 8 texel tile.
	ivec2 image_pos = image_offset + ivec2(gl_GlobalInvocationID.xy * ivec2(1, 2));

	// Average the two texels; fetches are clamped to the last valid texel.
	float depth = texelFetch(source_depth, min(image_pos, params.source_size - ivec2(1)), 0).r;
	depth += texelFetch(source_depth, min(image_pos + ivec2(0, 1), params.source_size - ivec2(1)), 0).r;
	depth *= 0.5;

#ifdef MODE_REDUCE_8
	// Fast path: collapse the whole subgroup into a single value.
	// NOTE(review): dividing by 32 assumes the subgroup covers exactly the 32
	// invocations of the workgroup - confirm for other subgroup sizes.
	float depth_average = subgroupAdd(depth) / 32.0;
	if (local_pos == uvec2(0)) {
		imageStore(dst_depth, image_pos / 8, vec4(depth_average));
	}
#else
	// Slower path: reduce per region instead of over the whole subgroup.
	// NOTE(review): params.min_size == 0 divides by zero here - confirm that
	// callers never pass 0 to this variant.
	uint group_size = (8 / params.min_size);
	uvec2 group_id = local_pos / group_size;

	float depth_average = 0.0;

	// Peel off one region per iteration: the first still-active invocation
	// broadcasts its region id, every invocation of that region sums its
	// depth with the others and leaves the loop, the rest go around again.
	while (true) {
		uvec2 first = subgroupBroadcastFirst(group_id);
		if (first == group_id) {
			depth_average = subgroupAdd(depth);
			break;
		}
	}

	depth_average /= float(group_size * group_size);

	// NOTE(review): this compares the invocation position with the region
	// index rather than with the region's first invocation
	// (group_id * group_size) - kept as in the original, verify the intent.
	if (local_pos == group_id) {
		imageStore(dst_depth, image_pos / int(group_size), vec4(depth_average));
	}
#endif

#endif

#ifdef MODE_REDUCE

	uvec2 pos = gl_LocalInvocationID.xy;

	ivec2 image_offset = params.source_offset;
	ivec2 image_pos = image_offset + ivec2(gl_GlobalInvocationID.xy);
	// Store the texel at bit-reversed tile coordinates so that the entries
	// combined at each step below sit exactly `size` apart.
	uint dst_t = swizzle_table[pos.y] * BLOCK_SIZE + swizzle_table[pos.x];
	tmp_data[dst_t] = texelFetch(source_depth, min(image_pos, params.source_size - ivec2(1)), 0).r;
	ivec2 image_size = params.source_size;

	// Linear index of this invocation's own slot in the tile.
	uint t = pos.y * BLOCK_SIZE + pos.x;

	// Current edge length of the live part of the tile.
	uint size = BLOCK_SIZE;

	do {
		// Make the previous writes to tmp_data visible to the whole workgroup
		// before anyone reads them.
		groupMemoryBarrier();
		barrier();

		// Each pass halves the tile and, with it, the destination size/offset.
		size >>= 1;
		image_size >>= 1;
		image_offset >>= 1;

		if (all(lessThan(pos, uvec2(size)))) {
			// Slots `size` away in x, y and both; no invocation that is still
			// active writes to them during this pass.
			uint nx = t + size;
			uint ny = t + (BLOCK_SIZE * size);
			uint nxy = ny + size;

			tmp_data[t] += tmp_data[nx];
			tmp_data[t] += tmp_data[ny];
			tmp_data[t] += tmp_data[nxy];
			tmp_data[t] /= 4.0;
		}

	} while (size > params.min_size);

	if (all(lessThan(pos, uvec2(size)))) {
		// Undo the bit-reversal to get the position inside the reduced tile,
		// then place the tile in the destination image.
		image_pos = ivec2(unswizzle_table[size + pos.x], unswizzle_table[size + pos.y]);
		image_pos += image_offset + ivec2(gl_WorkGroupID.xy) * int(size);

		image_size = max(ivec2(1), image_size); // In case the image size became 0.

		if (all(lessThan(image_pos, uvec2(image_size)))) {
			imageStore(dst_depth, image_pos, vec4(tmp_data[t]));
		}
	}
#endif

#ifdef MODE_FILTER

	ivec2 image_pos = params.source_offset + ivec2(gl_GlobalInvocationID.xy);
	// Invocations outside the source have nothing to write.
	if (any(greaterThanEqual(image_pos, params.source_size))) {
		return;
	}

	// Taps are clamped to this rectangle instead of reading outside it.
	ivec2 clamp_min = ivec2(params.source_offset);
	ivec2 clamp_max = ivec2(params.source_size) - 1;

	// Three 9-tap gaussian kernels stored back to back, from narrowest to
	// widest; each row sums to ~1. gaussian_kernel_version is added to the tap
	// index, so it selects a kernel by its starting offset.
	// NOTE(review): presumably callers pass 0, 9 or 18 - anything above 18
	// would index past the end of the table; confirm against the caller.
	const int kernel_size = 9;
	const float gaussian_kernel[kernel_size * 3] = float[](
			0.000229, 0.005977, 0.060598, 0.241732, 0.382928, 0.241732, 0.060598, 0.005977, 0.000229,
			0.028532, 0.067234, 0.124009, 0.179044, 0.20236, 0.179044, 0.124009, 0.067234, 0.028532,
			0.081812, 0.101701, 0.118804, 0.130417, 0.134535, 0.130417, 0.118804, 0.101701, 0.081812);
	float accum = 0.0;
	for (int i = 0; i < kernel_size; i++) {
		// Taps are centred on image_pos and step along filter_dir.
		ivec2 ofs = clamp(image_pos + params.filter_dir * (i - kernel_size / 2), clamp_min, clamp_max);
		accum += imageLoad(source_depth, ofs).r * gaussian_kernel[params.gaussian_kernel_version + i];
	}

	imageStore(dst_depth, image_pos, vec4(accum));

#endif
}