Added GPU based cluster builder

Clustering is now GPU based, uses an implementation based on the Activision algorithm.
author: reduz <reduzio@gmail.com> 2021-01-17 13:25:38 -0300
committer: Juan Linietsky <reduzio@gmail.com> 2021-01-19 23:31:06 +0100
commit: 099dee35f47db3e293cb8e60287ffe6a44f3d5d4 (patch)
tree: dea148899efa156adf4c7b9ff32464871cef4253 /servers/rendering/renderer_rd/shaders
parent: 7008e3c6eafa374e5d64ee7867608abe696698c2 (diff)
10 files changed, 1071 insertions, 472 deletions
diff --git a/servers/rendering/renderer_rd/shaders/SCsub b/servers/rendering/renderer_rd/shaders/SCsub
index deaa9668df..1b0197c1c1 100644
--- a/servers/rendering/renderer_rd/shaders/SCsub
+++ b/servers/rendering/renderer_rd/shaders/SCsub
@@ -44,3 +44,6 @@ if "RD_GLSL" in env["BUILDERS"]:
     env.RD_GLSL("particles_copy.glsl")
     env.RD_GLSL("sort.glsl")
     env.RD_GLSL("skeleton.glsl")
+    env.RD_GLSL("cluster_render.glsl")
+    env.RD_GLSL("cluster_store.glsl")
+    env.RD_GLSL("cluster_debug.glsl")
diff --git a/servers/rendering/renderer_rd/shaders/cluster_debug.glsl b/servers/rendering/renderer_rd/shaders/cluster_debug.glsl
new file mode 100644
index 0000000000..70a875192c
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/cluster_debug.glsl
@@ -0,0 +1,115 @@
+#[compute]
+
+#version 450
+
+VERSION_DEFINES
+
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+const vec3 usage_gradient[33] = vec3[]( // item-count heat map: entry 0 = no items, entries 1..32 = that many items (main() clamps the count to 32)
+		vec3(0.14, 0.17, 0.23),
+		vec3(0.24, 0.44, 0.83),
+		vec3(0.23, 0.57, 0.84),
+		vec3(0.22, 0.71, 0.84),
+		vec3(0.22, 0.85, 0.83),
+		vec3(0.21, 0.85, 0.72),
+		vec3(0.21, 0.85, 0.57),
+		vec3(0.20, 0.85, 0.42),
+		vec3(0.20, 0.85, 0.27),
+		vec3(0.27, 0.86, 0.19),
+		vec3(0.51, 0.85, 0.19),
+		vec3(0.57, 0.86, 0.19),
+		vec3(0.62, 0.85, 0.19),
+		vec3(0.67, 0.86, 0.20),
+		vec3(0.73, 0.85, 0.20),
+		vec3(0.78, 0.85, 0.20),
+		vec3(0.83, 0.85, 0.20),
+		vec3(0.85, 0.82, 0.20),
+		vec3(0.85, 0.76, 0.20),
+		vec3(0.85, 0.71, 0.20), // green was 0.81, which broke the steadily falling ramp between 0.76 and 0.65
+		vec3(0.85, 0.65, 0.20),
+		vec3(0.84, 0.60, 0.21),
+		vec3(0.84, 0.56, 0.21),
+		vec3(0.84, 0.51, 0.21),
+		vec3(0.84, 0.46, 0.21),
+		vec3(0.84, 0.41, 0.21),
+		vec3(0.84, 0.36, 0.21),
+		vec3(0.84, 0.31, 0.21),
+		vec3(0.84, 0.27, 0.21),
+		vec3(0.83, 0.22, 0.22),
+		vec3(0.83, 0.22, 0.27),
+		vec3(0.83, 0.22, 0.32),
+		vec3(1.00, 0.63, 0.70));
+layout(push_constant, binding = 0, std430) uniform Params {
+	uvec2 screen_size; // invocations at or beyond this size return without writing
+	uvec2 cluster_screen_size; // cluster grid size; x is the row stride, x * y the stride between cluster types
+
+	uint cluster_shift; // right shift that turns a pixel coordinate into a cluster coordinate
+	uint cluster_type; // selects which cluster_screen_size.x * y sized section of cluster_data is read
+	float z_near; // used together with z_far to linearize the sampled depth
+	float z_far; // also used to normalize the linearized depth before slicing
+
+	bool orthogonal; // picks the orthogonal depth formula instead of the perspective one
+	uint max_cluster_element_count_div_32; // number of 32-bit item bitmask words per cluster
+	uint pad1; // not read by this shader
+	uint pad2; // not read by this shader
+}
+params;
+
+layout(set = 0, binding = 1, std430) buffer restrict readonly ClusterData {
+	uint data[]; // per cluster: max_cluster_element_count_div_32 item bitmask words, then 32 words read as per-slice (min | max << 16)
+}
+cluster_data;
+
+layout(rgba16f, set = 0, binding = 2) uniform restrict writeonly image2D screen_buffer; // receives the debug color for each pixel
+layout(set = 0, binding = 3) uniform texture2D depth_buffer; // fetched per pixel to pick the depth slice
+layout(set = 0, binding = 4) uniform sampler depth_buffer_sampler; // only used to build the sampler2D for texelFetch
+
+void main() { // debug view: colors each pixel by how many items its cluster holds in the pixel's depth slice
+	uvec2 screen_pos = gl_GlobalInvocationID.xy;
+	if (any(greaterThanEqual(screen_pos, params.screen_size))) {
+		return; // invocation falls outside the screen
+	}
+
+	uvec2 cluster_pos = screen_pos >> params.cluster_shift;
+
+	uint offset = cluster_pos.y * params.cluster_screen_size.x + cluster_pos.x; // linear index of the cluster in the grid
+	offset += params.cluster_screen_size.x * params.cluster_screen_size.y * params.cluster_type; // skip to the section for this cluster type
+	offset *= (params.max_cluster_element_count_div_32 + 32); // words per cluster: item bitmask words + 32 depth slice words
+
+	//depth buffers generally can't be accessed via the image API, so fetch through a sampler and remap to -1..1
+	float depth = texelFetch(sampler2D(depth_buffer, depth_buffer_sampler), ivec2(screen_pos), 0).r * 2.0 - 1.0;
+
+	if (params.orthogonal) { // rebuild a linear depth from the remapped value; the formula depends on the projection type
+		depth = ((depth + (params.z_far + params.z_near) / (params.z_far - params.z_near)) * (params.z_far - params.z_near)) / 2.0;
+	} else {
+		depth = 2.0 * params.z_near * params.z_far / (params.z_far + params.z_near - depth * (params.z_far - params.z_near));
+	}
+	depth /= params.z_far; // normalize by the far distance
+
+	uint slice = uint(clamp(floor(depth * 32.0), 0.0, 31.0)); // one of 32 depth slices; clamped as float before converting
+	uint slice_minmax = cluster_data.data[offset + params.max_cluster_element_count_div_32 + slice]; // slice words follow the bitmask words
+	uint item_min = slice_minmax & 0xFFFF; // low 16 bits: first item index of the slice range
+	uint item_max = slice_minmax >> 16; // high 16 bits: end of the range, exclusive (see the test below)
+
+	uint item_count = 0;
+	for (uint i = 0; i < params.max_cluster_element_count_div_32; i++) { // walk every bitmask word of this cluster
+		uint slice_bits = cluster_data.data[offset + i];
+		while (slice_bits != 0) {
+			uint bit = findLSB(slice_bits); // lowest set bit = next item flagged in this cluster
+			uint item = i * 32 + bit;
+			if ((item >= item_min && item < item_max)) { // count only items inside this slice's range
+				item_count++;
+			}
+			slice_bits &= ~(1 << bit); // clear the bit and keep iterating
+		}
+	}
+
+	item_count = min(item_count, 32); // the last gradient entry is shared by every count of 32 or more
+
+	vec3 color = usage_gradient[item_count];
+
+	color = mix(color * 1.2, color * 0.3, float(slice) / 31.0); // slice 0 is drawn brightest, slice 31 darkest
+
+	imageStore(screen_buffer, ivec2(screen_pos), vec4(color, 1.0));
+}
diff --git a/servers/rendering/renderer_rd/shaders/cluster_render.glsl b/servers/rendering/renderer_rd/shaders/cluster_render.glsl
new file mode 100644
index 0000000000..8723ea78e4
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/cluster_render.glsl
@@ -0,0 +1,168 @@
+#[vertex]
+
+#version 450
+
+VERSION_DEFINES
+
+layout(location = 0) in vec3 vertex_attrib;
+
+layout(location = 0) out float depth_interp;
+layout(location = 1) out flat uint element_index;
+
+layout(push_constant, binding = 0, std430) uniform Params {
+	uint base_index; // added to gl_InstanceIndex to get the element's index in render_elements
+	uint pad0; // not read by this shader
+	uint pad1; // not read by this shader
+	uint pad2; // not read by this shader
+}
+params;
+
+layout(set = 0, binding = 1, std140) uniform State {
+	mat4 projection; // applied to the transformed vertex to produce gl_Position
+
+	float inv_z_far; // multiplied with the interpolated depth in the fragment stage to normalize it
+	uint screen_to_clusters_shift; // shift to obtain coordinates in block indices
+	uint cluster_screen_width; // row stride of the cluster grid, in clusters
+	uint cluster_data_size; // how much data for a single cluster takes
+
+	uint cluster_depth_offset; // offset inside a cluster's data where the per-element depth words start
+	uint pad0;
+	uint pad1;
+	uint pad2;
+}
+state;
+
+struct RenderElement { // only scale and transform_inv are read in this shader file
+	uint type; //0-4
+	bool touches_near;
+	bool touches_far;
+	uint original_index;
+	mat3x4 transform_inv; // applied to the scaled vertex in the vertex stage
+	vec3 scale; // per-axis factor applied to the input vertex before the transform
+	uint pad;
+};
+
+layout(set = 0, binding = 2, std430) buffer restrict readonly RenderElements {
+	RenderElement data[];
+}
+render_elements;
+
+void main() { // vertex stage: places one element's volume mesh and forwards its depth and index
+	element_index = params.base_index + gl_InstanceIndex; // one instance per render element
+
+	vec3 vertex = vertex_attrib;
+	vertex *= render_elements.data[element_index].scale;
+
+	vertex = vec4(vertex, 1.0) * render_elements.data[element_index].transform_inv; // row vector times mat3x4 yields a vec3
+	depth_interp = -vertex.z; // negated z of the transformed vertex, interpolated for the fragment stage
+
+	gl_Position = state.projection * vec4(vertex, 1.0);
+}
+
+#[fragment]
+
+#version 450
+
+VERSION_DEFINES
+
+#if defined(GL_KHR_shader_subgroup_ballot) && defined(GL_KHR_shader_subgroup_arithmetic) && defined(GL_KHR_shader_subgroup_vote)
+
+#extension GL_KHR_shader_subgroup_ballot : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#extension GL_KHR_shader_subgroup_vote : enable
+
+#define USE_SUBGROUPS
+#endif
+
+layout(location = 0) in float depth_interp;
+layout(location = 1) in flat uint element_index;
+
+layout(set = 0, binding = 1, std140) uniform State {
+	mat4 projection; // not read by the fragment stage
+	float inv_z_far; // scales the interpolated depth before it is split into 32 slices
+	uint screen_to_clusters_shift; // shift to obtain coordinates in block indices
+	uint cluster_screen_width; // row stride of the cluster grid, in clusters
+	uint cluster_data_size; // how much data for a single cluster takes
+	uint cluster_depth_offset; // offset inside a cluster's data where the per-element depth words start
+	uint pad0;
+	uint pad1;
+	uint pad2;
+}
+state;
+
+//cluster data is laid out linearly, each cell contains the following information:
+// - list of bits for every element to mark as used, so (max_elem_count/32)*4 uints
+// - a uint for each element to mark the depth bits used when rendering (0-31)
+
+layout(set = 0, binding = 3, std430) buffer restrict ClusterRender {
+	uint data[];
+}
+cluster_render;
+
+void main() { // fragment stage: flags this element as used in the covered cluster and records which depth slice it touches
+	//convert from screen to cluster
+	uvec2 cluster = uvec2(gl_FragCoord.xy) >> state.screen_to_clusters_shift;
+
+	//get linear cluster offset from screen pos
+	uint cluster_offset = cluster.x + state.cluster_screen_width * cluster.y;
+	//multiply by data size to position at the beginning of the element list for this cluster
+	cluster_offset *= state.cluster_data_size;
+
+	//find the current element in the list and plot the bit to mark it as used (32 elements per word)
+	uint usage_write_offset = cluster_offset + (element_index >> 5);
+	uint usage_write_bit = 1 << (element_index & 0x1F);
+
+#ifdef USE_SUBGROUPS
+
+	uint cluster_thread_group_index; //only assigned and read by non-helper invocations
+
+	if (!gl_HelperInvocation) {
+		//http://advances.realtimerendering.com/s2017/2017_Sig_Improved_Culling_final.pdf
+
+		uvec4 mask;
+
+		while (true) {
+			// find the cluster offset of the first active thread
+			// threads that did break; go inactive and no longer count
+			uint first = subgroupBroadcastFirst(cluster_offset);
+			// update the mask for thread that match this cluster
+			mask = subgroupBallot(first == cluster_offset);
+			if (first == cluster_offset) {
+				// This thread belongs to the group of threads that match this offset,
+				// so exit the loop.
+				break;
+			}
+		}
+
+		cluster_thread_group_index = subgroupBallotExclusiveBitCount(mask); //0 for the first thread of each group
+
+		if (cluster_thread_group_index == 0) { //a single atomic per group of threads sharing the cluster
+			atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);
+		}
+	}
+#else
+	if (!gl_HelperInvocation) {
+		atomicOr(cluster_render.data[usage_write_offset], usage_write_bit);
+	}
+#endif
+	//find the current element in the depth usage list and mark the current depth as used
+	float unit_depth = depth_interp * state.inv_z_far;
+
+	uint z_bit = uint(clamp(floor(unit_depth * 32.0), 0.0, 31.0)); //clamp as float first: uint() of a negative float is undefined
+
+	uint z_write_offset = cluster_offset + state.cluster_depth_offset + element_index; //one depth word per element
+	uint z_write_bit = 1 << z_bit;
+
+#ifdef USE_SUBGROUPS
+	if (!gl_HelperInvocation) {
+		z_write_bit = subgroupOr(z_write_bit); //merge all Zs
+		if (cluster_thread_group_index == 0) {
+			atomicOr(cluster_render.data[z_write_offset], z_write_bit);
+		}
+	}
+#else
+	if (!gl_HelperInvocation) {
+		atomicOr(cluster_render.data[z_write_offset], z_write_bit);
+	}
+#endif
+}
diff --git a/servers/rendering/renderer_rd/shaders/cluster_store.glsl b/servers/rendering/renderer_rd/shaders/cluster_store.glsl
new file mode 100644
index 0000000000..5be0893c4f
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/cluster_store.glsl
@@ -0,0 +1,119 @@
+#[compute]
+
+#version 450
+
+VERSION_DEFINES
+
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+layout(push_constant, binding = 0, std430) uniform Params {
+	uint cluster_render_data_size; // how much data for a single cluster takes
+	uint max_render_element_count_div_32; //divided by 32; also the offset to the per-element z range words of a cluster
+	uvec2 cluster_screen_size; // cluster grid size; invocations outside it return immediately
+	uint render_element_count_div_32; //divided by 32; number of usage bitmask words scanned per cluster
+
+	uint max_cluster_element_count_div_32; //divided by 32; bitmask words per cluster in the output buffer
+	uint pad1; // not read by this shader
+	uint pad2; // not read by this shader
+}
+params;
+
+layout(set = 0, binding = 1, std430) buffer restrict readonly ClusterRender {
+	uint data[]; // input, cluster_render_data_size words per cluster: usage bitmask words, then one z range word per element
+}
+cluster_render;
+
+layout(set = 0, binding = 2, std430) buffer restrict ClusterStore {
+	uint data[]; // output, per cluster and type: max_cluster_element_count_div_32 bitmask words, then 32 z slice min/max words
+}
+cluster_store;
+
+struct RenderElement { // transform_inv and scale are not read by this shader
+	uint type; //0-4; selects the per-type section of the output buffer
+	bool touches_near; // when set, the element's z range is extended down to slice 0
+	bool touches_far; // when set, the element's z range is extended up to slice 32 (exclusive)
+	uint original_index; // index written to the output bitmask and z slice ranges
+	mat3x4 transform_inv;
+	vec3 scale;
+	uint pad;
+};
+
+layout(set = 0, binding = 3, std430) buffer restrict readonly RenderElements {
+	RenderElement data[];
+}
+render_elements;
+
+void main() { // converts the rasterized usage and depth bits of one cluster into the per-type data read at render time
+	uvec2 pos = gl_GlobalInvocationID.xy;
+	if (any(greaterThanEqual(pos, params.cluster_screen_size))) {
+		return;
+	}
+
+	//each invocation processes a single cluster of the grid
+
+	//base offset for this cluster
+	uint base_offset = (pos.x + params.cluster_screen_size.x * pos.y);
+	uint src_offset = base_offset * params.cluster_render_data_size;
+
+	uint render_element_offset = 0;
+
+	//check all render_elements and see which one was written to
+	while (render_element_offset < params.render_element_count_div_32) {
+		uint bits = cluster_render.data[src_offset + render_element_offset];
+		while (bits != 0) {
+			//if bits exist, check the render_element
+			uint index_bit = findLSB(bits);
+			uint index = render_element_offset * 32 + index_bit;
+			uint type = render_elements.data[index].type;
+
+			uint z_range_offset = src_offset + params.max_render_element_count_div_32 + index; //z words follow the usage bitmask words
+			uint z_range = cluster_render.data[z_range_offset];
+
+			//if object was written, z was written, but check just in case
+			if (z_range != 0) { //should always be > 0
+
+				uint from_z = findLSB(z_range); //first slice touched
+				uint to_z = findMSB(z_range) + 1; //one past the last slice touched
+
+				if (render_elements.data[index].touches_near) {
+					from_z = 0;
+				}
+
+				if (render_elements.data[index].touches_far) {
+					to_z = 32;
+				}
+
+				// find cluster offset in the buffer used for indexing in the renderer
+				uint dst_offset = (base_offset + type * (params.cluster_screen_size.x * params.cluster_screen_size.y)) * (params.max_cluster_element_count_div_32 + 32);
+
+				uint orig_index = render_elements.data[index].original_index;
+				//widen the min/max index range of every Z slice this element covers
+				for (uint i = from_z; i < to_z; i++) {
+					uint slice_ofs = dst_offset + params.max_cluster_element_count_div_32 + i;
+
+					uint minmax = cluster_store.data[slice_ofs];
+
+					if (minmax == 0) {
+						minmax = 0xFFFF; //empty slice: start from min = 0xFFFF (low 16 bits), max = 0 (high 16 bits)
+					}
+
+					uint elem_min = min(orig_index, minmax & 0xFFFF);
+					uint elem_max = max(orig_index + 1, minmax >> 16); //always store plus one, so zero means range is empty when not written to
+
+					minmax = elem_min | (elem_max << 16);
+					cluster_store.data[slice_ofs] = minmax;
+				}
+
+				uint store_word = orig_index >> 5;
+				uint store_bit = orig_index & 0x1F;
+
+				//finally set the element's bit in the bitmask words at the start of the cluster's data, so the rendering code can reference it
+				cluster_store.data[dst_offset + store_word] |= 1 << store_bit;
+			}
+
+			bits &= ~(1 << index_bit); //clear the bit to continue iterating
+		}
+
+		render_element_offset++;
+	}
+}
diff --git a/servers/rendering/renderer_rd/shaders/gi.glsl b/servers/rendering/renderer_rd/shaders/gi.glsl
index 8011dadc72..c2965f9874 100644
--- a/servers/rendering/renderer_rd/shaders/gi.glsl
+++ b/servers/rendering/renderer_rd/shaders/gi.glsl
@@ -99,7 +99,7 @@ layout(push_constant, binding = 0, std430) uniform Params {
 
 	uint max_giprobes;
 	bool high_quality_vct;
-	bool use_sdfgi;
+	uint pad2;
 	bool orthogonal;
 
 	vec3 ao_color;
@@ -331,7 +331,7 @@ void sdfgi_process(vec3 vertex, vec3 normal, vec3 reflection, float roughness, o
 		}
 
 		ambient_light.rgb = diffuse;
-#if 1
+
 		if (roughness < 0.2) {
 			vec3 pos_to_uvw = 1.0 / sdfgi.grid_size;
 			vec4 light_accum = vec4(0.0);
@@ -363,7 +363,6 @@ void sdfgi_process(vec3 vertex, vec3 normal, vec3 reflection, float roughness, o
 				//ray_pos += ray_dir * (bias / sdfgi.cascades[cascade].to_cell); //bias to avoid self occlusion
 				ray_pos += (ray_dir * 1.0 / max(abs_ray_dir.x, max(abs_ray_dir.y, abs_ray_dir.z)) + cam_normal * 1.4) * bias / sdfgi.cascades[cascade].to_cell;
 			}
-
 			float softness = 0.2 + min(1.0, roughness * 5.0) * 4.0; //approximation to roughness so it does not seem like a hard fade
 			while (length(ray_pos) < max_distance) {
 				for (uint i = 0; i < sdfgi.max_cascades; i++) {
@@ -434,8 +433,6 @@ void sdfgi_process(vec3 vertex, vec3 normal, vec3 reflection, float roughness, o
 			}
 		}
 
-#endif
-
 		reflection_light.rgb = specular;
 
 		ambient_light.rgb *= sdfgi.energy;
@@ -621,11 +618,12 @@ void main() {
 
 		vec3 reflection = normalize(reflect(normalize(vertex), normal));
 
-		if (params.use_sdfgi) {
-			sdfgi_process(vertex, normal, reflection, roughness, ambient_light, reflection_light);
-		}
+#ifdef USE_SDFGI
+		sdfgi_process(vertex, normal, reflection, roughness, ambient_light, reflection_light);
+#endif
 
-		if (params.max_giprobes > 0) {
+#ifdef USE_GIPROBES
+		{
 			uvec2 giprobe_tex = texelFetch(usampler2D(giprobe_buffer, linear_sampler), pos, 0).rg;
 			roughness *= roughness;
 			//find arbitrary tangent and bitangent, then build a matrix
@@ -656,6 +654,7 @@ void main() {
 				ambient_light = amb_accum;
 			}
 		}
+#endif
 	}
 
 	imageStore(ambient_buffer, pos, ambient_light);
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward.glsl b/servers/rendering/renderer_rd/shaders/scene_forward.glsl
index 7fa5f7b0fe..c3e7e2acbf 100644
--- a/servers/rendering/renderer_rd/shaders/scene_forward.glsl
+++ b/servers/rendering/renderer_rd/shaders/scene_forward.glsl
@@ -541,7 +541,7 @@ vec3 F0(float metallic, float specular, vec3 albedo) {
 	return mix(vec3(dielectric), albedo, vec3(metallic));
 }
 
-void light_compute(vec3 N, vec3 L, vec3 V, vec3 light_color, float attenuation, vec3 f0, uint orms,
+void light_compute(vec3 N, vec3 L, vec3 V, vec3 light_color, float attenuation, vec3 f0, uint orms, float specular_amount,
 #ifdef LIGHT_BACKLIGHT_USED
 		vec3 backlight,
 #endif
@@ -710,7 +710,7 @@ LIGHT_SHADER_CODE
 		blinn *= (shininess + 8.0) * (1.0 / (8.0 * M_PI));
 		float intensity = blinn;
 
-		specular_light += light_color * intensity * attenuation;
+		specular_light += light_color * intensity * attenuation * specular_amount;
 
 #elif defined(SPECULAR_PHONG)
 
@@ -721,7 +721,7 @@ LIGHT_SHADER_CODE
 		phong *= (shininess + 8.0) * (1.0 / (8.0 * M_PI));
 		float intensity = (phong) / max(4.0 * cNdotV * cNdotL, 0.75);
 
-		specular_light += light_color * intensity * attenuation;
+		specular_light += light_color * intensity * attenuation * specular_amount;
 
 #elif defined(SPECULAR_TOON)
 
@@ -730,7 +730,7 @@ LIGHT_SHADER_CODE
 		float mid = 1.0 - roughness;
 		mid *= mid;
 		float intensity = smoothstep(mid - roughness * 0.5, mid + roughness * 0.5, RdotV) * mid;
-		diffuse_light += light_color * intensity * attenuation; // write to diffuse_light, as in toon shading you generally want no reflection
+		diffuse_light += light_color * intensity * attenuation * specular_amount; // write to diffuse_light, as in toon shading you generally want no reflection
 
 #elif defined(SPECULAR_DISABLED)
 		// none..
@@ -760,7 +760,7 @@ LIGHT_SHADER_CODE
 
 		vec3 specular_brdf_NL = cNdotL * D * F * G;
 
-		specular_light += specular_brdf_NL * light_color * attenuation;
+		specular_light += specular_brdf_NL * light_color * attenuation * specular_amount;
 #endif
 
 #if defined(LIGHT_CLEARCOAT_USED)
@@ -774,7 +774,7 @@ LIGHT_SHADER_CODE
 
 		float clearcoat_specular_brdf_NL = 0.25 * clearcoat * Gr * Fr * Dr * cNdotL;
 
-		specular_light += clearcoat_specular_brdf_NL * light_color * attenuation;
+		specular_light += clearcoat_specular_brdf_NL * light_color * attenuation * specular_amount;
 #endif
 	}
 
@@ -903,28 +903,28 @@ float get_omni_attenuation(float distance, float inv_range, float decay) {
 
 float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 #ifndef USE_NO_SHADOWS
-	if (lights.data[idx].shadow_enabled) {
+	if (omni_lights.data[idx].shadow_enabled) {
 		// there is a shadowmap
 
-		vec3 light_rel_vec = lights.data[idx].position - vertex;
+		vec3 light_rel_vec = omni_lights.data[idx].position - vertex;
 		float light_length = length(light_rel_vec);
 
 		vec4 v = vec4(vertex, 1.0);
 
-		vec4 splane = (lights.data[idx].shadow_matrix * v);
+		vec4 splane = (omni_lights.data[idx].shadow_matrix * v);
 		float shadow_len = length(splane.xyz); //need to remember shadow len from here
 
 		{
-			vec3 nofs = normal_interp * lights.data[idx].shadow_normal_bias / lights.data[idx].inv_radius;
+			vec3 nofs = normal_interp * omni_lights.data[idx].shadow_normal_bias / omni_lights.data[idx].inv_radius;
 			nofs *= (1.0 - max(0.0, dot(normalize(light_rel_vec), normalize(normal_interp))));
 			v.xyz += nofs;
-			splane = (lights.data[idx].shadow_matrix * v);
+			splane = (omni_lights.data[idx].shadow_matrix * v);
 		}
 
 		float shadow;
 
 #ifdef USE_SOFT_SHADOWS
-		if (lights.data[idx].soft_shadow_size > 0.0) {
+		if (omni_lights.data[idx].soft_shadow_size > 0.0) {
 			//soft shadow
 
 			//find blocker
@@ -944,10 +944,10 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 			vec3 v0 = abs(normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);
 			vec3 tangent = normalize(cross(v0, normal));
 			vec3 bitangent = normalize(cross(tangent, normal));
-			float z_norm = shadow_len * lights.data[idx].inv_radius;
+			float z_norm = shadow_len * omni_lights.data[idx].inv_radius;
 
-			tangent *= lights.data[idx].soft_shadow_size * lights.data[idx].soft_shadow_scale;
-			bitangent *= lights.data[idx].soft_shadow_size * lights.data[idx].soft_shadow_scale;
+			tangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
+			bitangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
 
 			for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
 				vec2 disk = disk_rotation * scene_data.penumbra_shadow_kernel[i].xy;
@@ -955,7 +955,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 				vec3 pos = splane.xyz + tangent * disk.x + bitangent * disk.y;
 
 				pos = normalize(pos);
-				vec4 uv_rect = lights.data[idx].atlas_rect;
+				vec4 uv_rect = omni_lights.data[idx].atlas_rect;
 
 				if (pos.z >= 0.0) {
 					pos.z += 1.0;
@@ -983,7 +983,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 				tangent *= penumbra;
 				bitangent *= penumbra;
 
-				z_norm -= lights.data[idx].inv_radius * lights.data[idx].shadow_bias;
+				z_norm -= omni_lights.data[idx].inv_radius * omni_lights.data[idx].shadow_bias;
 
 				shadow = 0.0;
 				for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
@@ -991,7 +991,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 					vec3 pos = splane.xyz + tangent * disk.x + bitangent * disk.y;
 
 					pos = normalize(pos);
-					vec4 uv_rect = lights.data[idx].atlas_rect;
+					vec4 uv_rect = omni_lights.data[idx].atlas_rect;
 
 					if (pos.z >= 0.0) {
 						pos.z += 1.0;
@@ -1016,7 +1016,7 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 		} else {
 #endif
 			splane.xyz = normalize(splane.xyz);
-			vec4 clamp_rect = lights.data[idx].atlas_rect;
+			vec4 clamp_rect = omni_lights.data[idx].atlas_rect;
 
 			if (splane.z >= 0.0) {
 				splane.z += 1.0;
@@ -1030,10 +1030,10 @@ float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
 			splane.xy /= splane.z;
 
 			splane.xy = splane.xy * 0.5 + 0.5;
-			splane.z = (shadow_len - lights.data[idx].shadow_bias) * lights.data[idx].inv_radius;
+			splane.z = (shadow_len - omni_lights.data[idx].shadow_bias) * omni_lights.data[idx].inv_radius;
 			splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
 			splane.w = 1.0; //needed? i think it should be 1 already
-			shadow = sample_pcf_shadow(shadow_atlas, lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, splane);
+			shadow = sample_pcf_shadow(shadow_atlas, omni_lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, splane);
 #ifdef USE_SOFT_SHADOWS
 		}
 #endif
@@ -1068,17 +1068,17 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 		inout float alpha,
 #endif
 		inout vec3 diffuse_light, inout vec3 specular_light) {
-	vec3 light_rel_vec = lights.data[idx].position - vertex;
+	vec3 light_rel_vec = omni_lights.data[idx].position - vertex;
 	float light_length = length(light_rel_vec);
-	float omni_attenuation = get_omni_attenuation(light_length, lights.data[idx].inv_radius, lights.data[idx].attenuation);
+	float omni_attenuation = get_omni_attenuation(light_length, omni_lights.data[idx].inv_radius, omni_lights.data[idx].attenuation);
 	float light_attenuation = omni_attenuation;
-	vec3 color = lights.data[idx].color;
+	vec3 color = omni_lights.data[idx].color;
 
 #ifdef USE_SOFT_SHADOWS
 	float size_A = 0.0;
 
-	if (lights.data[idx].size > 0.0) {
-		float t = lights.data[idx].size / max(0.001, light_length);
+	if (omni_lights.data[idx].size > 0.0) {
+		float t = omni_lights.data[idx].size / max(0.001, light_length);
 		size_A = max(0.0, 1.0 - 1 / sqrt(1 + t * t));
 	}
 #endif
@@ -1087,10 +1087,10 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 	float transmittance_z = transmittance_depth; //no transmittance by default
 	transmittance_color.a *= light_attenuation;
 	{
-		vec4 clamp_rect = lights.data[idx].atlas_rect;
+		vec4 clamp_rect = omni_lights.data[idx].atlas_rect;
 
 		//redo shadowmapping, but shrink the model a bit to avoid arctifacts
-		vec4 splane = (lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * lights.data[idx].transmittance_bias, 1.0));
+		vec4 splane = (omni_lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * omni_lights.data[idx].transmittance_bias, 1.0));
 
 		shadow_len = length(splane.xyz);
 		splane = normalize(splane.xyz);
@@ -1104,22 +1104,22 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 
 		splane.xy /= splane.z;
 		splane.xy = splane.xy * 0.5 + 0.5;
-		splane.z = shadow_len * lights.data[idx].inv_radius;
+		splane.z = shadow_len * omni_lights.data[idx].inv_radius;
 		splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
 		splane.w = 1.0; //needed? i think it should be 1 already
 
 		float shadow_z = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), splane.xy, 0.0).r;
-		transmittance_z = (splane.z - shadow_z) / lights.data[idx].inv_radius;
+		transmittance_z = (splane.z - shadow_z) / omni_lights.data[idx].inv_radius;
 	}
 #endif
 
 #if 0
 
-	if (lights.data[idx].projector_rect != vec4(0.0)) {
-		vec3 local_v = (lights.data[idx].shadow_matrix * vec4(vertex, 1.0)).xyz;
+	if (omni_lights.data[idx].projector_rect != vec4(0.0)) {
+		vec3 local_v = (omni_lights.data[idx].shadow_matrix * vec4(vertex, 1.0)).xyz;
 		local_v = normalize(local_v);
 
-		vec4 atlas_rect = lights.data[idx].projector_rect;
+		vec4 atlas_rect = omni_lights.data[idx].projector_rect;
 
 		if (local_v.z >= 0.0) {
 			local_v.z += 1.0;
@@ -1136,7 +1136,7 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 		vec2 proj_uv_ddx;
 		vec2 proj_uv_ddy;
 		{
-			vec3 local_v_ddx = (lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddx, 1.0)).xyz;
+			vec3 local_v_ddx = (omni_lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddx, 1.0)).xyz;
 			local_v_ddx = normalize(local_v_ddx);
 
 			if (local_v_ddx.z >= 0.0) {
@@ -1150,7 +1150,7 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 
 			proj_uv_ddx = local_v_ddx.xy * atlas_rect.zw - proj_uv;
 
-			vec3 local_v_ddy = (lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddy, 1.0)).xyz;
+			vec3 local_v_ddy = (omni_lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddy, 1.0)).xyz;
 			local_v_ddy = normalize(local_v_ddy);
 
 			if (local_v_ddy.z >= 0.0) {
@@ -1172,7 +1172,7 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 
 	light_attenuation *= shadow;
 
-	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms,
+	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms, omni_lights.data[idx].specular_amount,
 #ifdef LIGHT_BACKLIGHT_USED
 			backlight,
 #endif
@@ -1204,37 +1204,37 @@ void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 
 float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) {
 #ifndef USE_NO_SHADOWS
-	if (lights.data[idx].shadow_enabled) {
-		vec3 light_rel_vec = lights.data[idx].position - vertex;
+	if (spot_lights.data[idx].shadow_enabled) {
+		vec3 light_rel_vec = spot_lights.data[idx].position - vertex;
 		float light_length = length(light_rel_vec);
-		vec3 spot_dir = lights.data[idx].direction;
+		vec3 spot_dir = spot_lights.data[idx].direction;
 		//there is a shadowmap
 		vec4 v = vec4(vertex, 1.0);
 
-		v.xyz -= spot_dir * lights.data[idx].shadow_bias;
+		v.xyz -= spot_dir * spot_lights.data[idx].shadow_bias;
 
-		float z_norm = dot(spot_dir, -light_rel_vec) * lights.data[idx].inv_radius;
+		float z_norm = dot(spot_dir, -light_rel_vec) * spot_lights.data[idx].inv_radius;
 
 		float depth_bias_scale = 1.0 / (max(0.0001, z_norm)); //the closer to the light origin, the more you have to offset to reach 1px in the map
-		vec3 normal_bias = normalize(normal_interp) * (1.0 - max(0.0, dot(spot_dir, -normalize(normal_interp)))) * lights.data[idx].shadow_normal_bias * depth_bias_scale;
+		vec3 normal_bias = normalize(normal_interp) * (1.0 - max(0.0, dot(spot_dir, -normalize(normal_interp)))) * spot_lights.data[idx].shadow_normal_bias * depth_bias_scale;
 		normal_bias -= spot_dir * dot(spot_dir, normal_bias); //only XY, no Z
 		v.xyz += normal_bias;
 
 		//adjust with bias
-		z_norm = dot(spot_dir, v.xyz - lights.data[idx].position) * lights.data[idx].inv_radius;
+		z_norm = dot(spot_dir, v.xyz - spot_lights.data[idx].position) * spot_lights.data[idx].inv_radius;
 
 		float shadow;
 
-		vec4 splane = (lights.data[idx].shadow_matrix * v);
+		vec4 splane = (spot_lights.data[idx].shadow_matrix * v);
 		splane /= splane.w;
 
 #ifdef USE_SOFT_SHADOWS
-		if (lights.data[idx].soft_shadow_size > 0.0) {
+		if (spot_lights.data[idx].soft_shadow_size > 0.0) {
 			//soft shadow
 
 			//find blocker
 
-			vec2 shadow_uv = splane.xy * lights.data[idx].atlas_rect.zw + lights.data[idx].atlas_rect.xy;
+			vec2 shadow_uv = splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy;
 
 			float blocker_count = 0.0;
 			float blocker_average = 0.0;
@@ -1247,11 +1247,11 @@ float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) {
 				disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
 			}
 
-			float uv_size = lights.data[idx].soft_shadow_size * z_norm * lights.data[idx].soft_shadow_scale;
-			vec2 clamp_max = lights.data[idx].atlas_rect.xy + lights.data[idx].atlas_rect.zw;
+			float uv_size = spot_lights.data[idx].soft_shadow_size * z_norm * spot_lights.data[idx].soft_shadow_scale;
+			vec2 clamp_max = spot_lights.data[idx].atlas_rect.xy + spot_lights.data[idx].atlas_rect.zw;
 			for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
 				vec2 suv = shadow_uv + (disk_rotation * scene_data.penumbra_shadow_kernel[i].xy) * uv_size;
-				suv = clamp(suv, lights.data[idx].atlas_rect.xy, clamp_max);
+				suv = clamp(suv, spot_lights.data[idx].atlas_rect.xy, clamp_max);
 				float d = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), suv, 0.0).r;
 				if (d < z_norm) {
 					blocker_average += d;
@@ -1268,7 +1268,7 @@ float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) {
 				shadow = 0.0;
 				for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
 					vec2 suv = shadow_uv + (disk_rotation * scene_data.penumbra_shadow_kernel[i].xy) * uv_size;
-					suv = clamp(suv, lights.data[idx].atlas_rect.xy, clamp_max);
+					suv = clamp(suv, spot_lights.data[idx].atlas_rect.xy, clamp_max);
 					shadow += textureProj(sampler2DShadow(shadow_atlas, shadow_sampler), vec4(suv, z_norm, 1.0));
 				}
 
@@ -1282,9 +1282,9 @@ float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) {
 		} else {
 #endif
 			//hard shadow
-			vec4 shadow_uv = vec4(splane.xy * lights.data[idx].atlas_rect.zw + lights.data[idx].atlas_rect.xy, z_norm, 1.0);
+			vec4 shadow_uv = vec4(splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy, z_norm, 1.0);
 
-			shadow = sample_pcf_shadow(shadow_atlas, lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, shadow_uv);
+			shadow = sample_pcf_shadow(shadow_atlas, spot_lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, shadow_uv);
 #ifdef USE_SOFT_SHADOWS
 		}
 #endif
@@ -1321,28 +1321,28 @@ void light_process_spot(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 #endif
 		inout vec3 diffuse_light,
 		inout vec3 specular_light) {
-	vec3 light_rel_vec = lights.data[idx].position - vertex;
+	vec3 light_rel_vec = spot_lights.data[idx].position - vertex;
 	float light_length = length(light_rel_vec);
-	float spot_attenuation = get_omni_attenuation(light_length, lights.data[idx].inv_radius, lights.data[idx].attenuation);
-	vec3 spot_dir = lights.data[idx].direction;
-	float scos = max(dot(-normalize(light_rel_vec), spot_dir), lights.data[idx].cone_angle);
-	float spot_rim = max(0.0001, (1.0 - scos) / (1.0 - lights.data[idx].cone_angle));
-	spot_attenuation *= 1.0 - pow(spot_rim, lights.data[idx].cone_attenuation);
+	float spot_attenuation = get_omni_attenuation(light_length, spot_lights.data[idx].inv_radius, spot_lights.data[idx].attenuation);
+	vec3 spot_dir = spot_lights.data[idx].direction;
+	float scos = max(dot(-normalize(light_rel_vec), spot_dir), spot_lights.data[idx].cone_angle);
+	float spot_rim = max(0.0001, (1.0 - scos) / (1.0 - spot_lights.data[idx].cone_angle));
+	spot_attenuation *= 1.0 - pow(spot_rim, spot_lights.data[idx].cone_attenuation);
 	float light_attenuation = spot_attenuation;
-	vec3 color = lights.data[idx].color;
-	float specular_amount = lights.data[idx].specular_amount;
+	vec3 color = spot_lights.data[idx].color;
+	float specular_amount = spot_lights.data[idx].specular_amount;
 
 #ifdef USE_SOFT_SHADOWS
 	float size_A = 0.0;
 
-	if (lights.data[idx].size > 0.0) {
-		float t = lights.data[idx].size / max(0.001, light_length);
+	if (spot_lights.data[idx].size > 0.0) {
+		float t = spot_lights.data[idx].size / max(0.001, light_length);
 		size_A = max(0.0, 1.0 - 1 / sqrt(1 + t * t));
 	}
 #endif
 
 	/*
-	if (lights.data[idx].atlas_rect!=vec4(0.0)) {
+	if (spot_lights.data[idx].atlas_rect!=vec4(0.0)) {
 		//use projector texture
 	}
 	*/
@@ -1351,13 +1351,13 @@ void light_process_spot(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 	float transmittance_z = transmittance_depth;
 	transmittance_color.a *= light_attenuation;
 	{
-		splane = (lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * lights.data[idx].transmittance_bias, 1.0));
+		splane = (spot_lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * spot_lights.data[idx].transmittance_bias, 1.0));
 		splane /= splane.w;
-		splane.xy = splane.xy * lights.data[idx].atlas_rect.zw + lights.data[idx].atlas_rect.xy;
+		splane.xy = splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy;
 
 		float shadow_z = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), splane.xy, 0.0).r;
 		//reconstruct depth
-		shadow_z /= lights.data[idx].inv_radius;
+		shadow_z /= spot_lights.data[idx].inv_radius;
 		//distance to light plane
 		float z = dot(spot_dir, -light_rel_vec);
 		transmittance_z = z - shadow_z;
@@ -1366,7 +1366,7 @@ void light_process_spot(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 v
 
 	light_attenuation *= shadow;
 
-	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms,
+	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms, spot_lights.data[idx].specular_amount,
 #ifdef LIGHT_BACKLIGHT_USED
 			backlight,
 #endif
@@ -1785,7 +1785,43 @@ vec4 fog_process(vec3 vertex) {
 	return vec4(fog_color, fog_amount);
 }
 
+// Reads the packed word at cluster_buffer.data[p_offset] (item_min in the low 16 bits, item_max in the high 16 bits) and derives the range [item_from, item_to) of 32-bit mask words that covers those items.
+void cluster_get_item_range(uint p_offset, out uint item_min, out uint item_max, out uint item_from, out uint item_to) {
+	uint item_min_max = cluster_buffer.data[p_offset];
+	item_min = item_min_max & 0xFFFF;
+	item_max = item_min_max >> 16;
+
+	item_from = item_min >> 5; // 32 items per mask word
+	item_to = (item_max == 0) ? 0 : ((item_max - 1) >> 5) + 1; //side effect of how it is stored, as item_max 0 means no elements
+}
+
+uint cluster_get_range_clip_mask(uint i, uint z_min, uint z_max) { // builds the clip mask that callers AND with cluster mask word i
+	int first_bit = clamp(int(z_min) - int(i) * 32, 0, 31); // z_min relative to the start of word i, clamped to a valid bit position
+	int bit_count = min(int(z_max) - int(z_min), 32 - first_bit); // never extends past bit 31 of the word
+	return bitfieldInsert(0u, 0xFFFFFFFFu, first_bit, bit_count); // bit_count ones starting at first_bit, zeros elsewhere
+}
+
+float blur_shadow(float shadow) { // currently a pass-through; shadow is passed by value, so callers must use the returned value
+	return shadow;
+#if 0
+	//disabled for now (compiled out by the #if 0 above), will investigate later
+	float interp_shadow = shadow;
+	if (gl_HelperInvocation) {
+		interp_shadow = -4.0; // technically anything below -4 will do but just to make sure
+	}
+
+	uvec2 fc2 = uvec2(gl_FragCoord.xy);
+	interp_shadow -= dFdx(interp_shadow) * (float(fc2.x & 1) - 0.5);
+	interp_shadow -= dFdy(interp_shadow) * (float(fc2.y & 1) - 0.5);
+
+	if (interp_shadow >= 0.0) {
+		shadow = interp_shadow;
+	}
+	return shadow;
 #endif
+}
+
+#endif //!MODE_RENDER_DEPTH
 
 void main() {
 #ifdef MODE_DUAL_PARABOLOID
@@ -2003,67 +2039,98 @@ FRAGMENT_SHADER_CODE
 
 #ifndef MODE_RENDER_DEPTH
 
-	uvec4 cluster_cell = texture(usampler3D(cluster_texture, material_samplers[SAMPLER_NEAREST_CLAMP]), vec3(screen_uv, (abs(vertex.z) - scene_data.z_near) / (scene_data.z_far - scene_data.z_near)));
+	uvec2 cluster_pos = uvec2(gl_FragCoord.xy) >> scene_data.cluster_shift;
+	uint cluster_offset = (scene_data.cluster_width * cluster_pos.y + cluster_pos.x) * (scene_data.max_cluster_element_count_div_32 + 32);
+
+	uint cluster_z = uint(clamp((-vertex.z / scene_data.z_far) * 32.0, 0.0, 31.0));
+
 	//used for interpolating anything cluster related
 	vec3 vertex_ddx = dFdx(vertex);
 	vec3 vertex_ddy = dFdy(vertex);
 
 	{ // process decals
 
-		uint decal_count = cluster_cell.w >> CLUSTER_COUNTER_SHIFT;
-		uint decal_pointer = cluster_cell.w & CLUSTER_POINTER_MASK;
+		uint cluster_decal_offset = cluster_offset + scene_data.cluster_type_size * 2;
 
-		//do outside for performance and avoiding arctifacts
+		uint item_min;
+		uint item_max;
+		uint item_from;
+		uint item_to;
 
-		for (uint i = 0; i < decal_count; i++) {
-			uint decal_index = cluster_data.indices[decal_pointer + i];
-			if (!bool(decals.data[decal_index].mask & draw_call.layer_mask)) {
-				continue; //not masked
-			}
+		cluster_get_item_range(cluster_decal_offset + scene_data.max_cluster_element_count_div_32 + cluster_z, item_min, item_max, item_from, item_to);
 
-			vec3 uv_local = (decals.data[decal_index].xform * vec4(vertex, 1.0)).xyz;
-			if (any(lessThan(uv_local, vec3(0.0, -1.0, 0.0))) || any(greaterThan(uv_local, vec3(1.0)))) {
-				continue; //out of decal
-			}
+#ifdef USE_SUBGROUPS
+		item_from = subgroupBroadcastFirst(subgroupMin(item_from));
+		item_to = subgroupBroadcastFirst(subgroupMax(item_to));
+#endif
 
-			//we need ddx/ddy for mipmaps, so simulate them
-			vec2 ddx = (decals.data[decal_index].xform * vec4(vertex_ddx, 0.0)).xz;
-			vec2 ddy = (decals.data[decal_index].xform * vec4(vertex_ddy, 0.0)).xz;
+		for (uint i = item_from; i < item_to; i++) {
+			uint mask = cluster_buffer.data[cluster_decal_offset + i];
+			mask &= cluster_get_range_clip_mask(i, item_min, item_max);
+#ifdef USE_SUBGROUPS
+			uint merged_mask = subgroupBroadcastFirst(subgroupOr(mask));
+#else
+			uint merged_mask = mask;
+#endif
 
-			float fade = pow(1.0 - (uv_local.y > 0.0 ? uv_local.y : -uv_local.y), uv_local.y > 0.0 ? decals.data[decal_index].upper_fade : decals.data[decal_index].lower_fade);
+			while (merged_mask != 0) {
+				uint bit = findMSB(merged_mask);
+				merged_mask &= ~(1 << bit);
+#ifdef USE_SUBGROUPS
+				if (((1 << bit) & mask) == 0) { //do not process if not originally here
+					continue;
+				}
+#endif
+				uint decal_index = 32 * i + bit;
 
-			if (decals.data[decal_index].normal_fade > 0.0) {
-				fade *= smoothstep(decals.data[decal_index].normal_fade, 1.0, dot(normal_interp, decals.data[decal_index].normal) * 0.5 + 0.5);
-			}
+				if (!bool(decals.data[decal_index].mask & draw_call.layer_mask)) {
+					continue; //not masked
+				}
+
+				vec3 uv_local = (decals.data[decal_index].xform * vec4(vertex, 1.0)).xyz;
+				if (any(lessThan(uv_local, vec3(0.0, -1.0, 0.0))) || any(greaterThan(uv_local, vec3(1.0)))) {
+					continue; //out of decal
+				}
+
+				//we need ddx/ddy for mipmaps, so simulate them
+				vec2 ddx = (decals.data[decal_index].xform * vec4(vertex_ddx, 0.0)).xz;
+				vec2 ddy = (decals.data[decal_index].xform * vec4(vertex_ddy, 0.0)).xz;
 
-			if (decals.data[decal_index].albedo_rect != vec4(0.0)) {
-				//has albedo
-				vec4 decal_albedo = textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].albedo_rect.zw + decals.data[decal_index].albedo_rect.xy, ddx * decals.data[decal_index].albedo_rect.zw, ddy * decals.data[decal_index].albedo_rect.zw);
-				decal_albedo *= decals.data[decal_index].modulate;
-				decal_albedo.a *= fade;
-				albedo = mix(albedo, decal_albedo.rgb, decal_albedo.a * decals.data[decal_index].albedo_mix);
-
-				if (decals.data[decal_index].normal_rect != vec4(0.0)) {
-					vec3 decal_normal = textureGrad(sampler2D(decal_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].normal_rect.zw + decals.data[decal_index].normal_rect.xy, ddx * decals.data[decal_index].normal_rect.zw, ddy * decals.data[decal_index].normal_rect.zw).xyz;
-					decal_normal.xy = decal_normal.xy * vec2(2.0, -2.0) - vec2(1.0, -1.0); //users prefer flipped y normal maps in most authoring software
-					decal_normal.z = sqrt(max(0.0, 1.0 - dot(decal_normal.xy, decal_normal.xy)));
-					//convert to view space, use xzy because y is up
-					decal_normal = (decals.data[decal_index].normal_xform * decal_normal.xzy).xyz;
-
-					normal = normalize(mix(normal, decal_normal, decal_albedo.a));
+				float fade = pow(1.0 - (uv_local.y > 0.0 ? uv_local.y : -uv_local.y), uv_local.y > 0.0 ? decals.data[decal_index].upper_fade : decals.data[decal_index].lower_fade);
+
+				if (decals.data[decal_index].normal_fade > 0.0) {
+					fade *= smoothstep(decals.data[decal_index].normal_fade, 1.0, dot(normal_interp, decals.data[decal_index].normal) * 0.5 + 0.5);
 				}
 
-				if (decals.data[decal_index].orm_rect != vec4(0.0)) {
-					vec3 decal_orm = textureGrad(sampler2D(decal_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].orm_rect.zw + decals.data[decal_index].orm_rect.xy, ddx * decals.data[decal_index].orm_rect.zw, ddy * decals.data[decal_index].orm_rect.zw).xyz;
-					ao = mix(ao, decal_orm.r, decal_albedo.a);
-					roughness = mix(roughness, decal_orm.g, decal_albedo.a);
-					metallic = mix(metallic, decal_orm.b, decal_albedo.a);
+				if (decals.data[decal_index].albedo_rect != vec4(0.0)) {
+					//has albedo
+					vec4 decal_albedo = textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].albedo_rect.zw + decals.data[decal_index].albedo_rect.xy, ddx * decals.data[decal_index].albedo_rect.zw, ddy * decals.data[decal_index].albedo_rect.zw);
+					decal_albedo *= decals.data[decal_index].modulate;
+					decal_albedo.a *= fade;
+					albedo = mix(albedo, decal_albedo.rgb, decal_albedo.a * decals.data[decal_index].albedo_mix);
+
+					if (decals.data[decal_index].normal_rect != vec4(0.0)) {
+						vec3 decal_normal = textureGrad(sampler2D(decal_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].normal_rect.zw + decals.data[decal_index].normal_rect.xy, ddx * decals.data[decal_index].normal_rect.zw, ddy * decals.data[decal_index].normal_rect.zw).xyz;
+						decal_normal.xy = decal_normal.xy * vec2(2.0, -2.0) - vec2(1.0, -1.0); //users prefer flipped y normal maps in most authoring software
+						decal_normal.z = sqrt(max(0.0, 1.0 - dot(decal_normal.xy, decal_normal.xy)));
+						//convert to view space, use xzy because y is up
+						decal_normal = (decals.data[decal_index].normal_xform * decal_normal.xzy).xyz;
+
+						normal = normalize(mix(normal, decal_normal, decal_albedo.a));
+					}
+
+					if (decals.data[decal_index].orm_rect != vec4(0.0)) {
+						vec3 decal_orm = textureGrad(sampler2D(decal_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].orm_rect.zw + decals.data[decal_index].orm_rect.xy, ddx * decals.data[decal_index].orm_rect.zw, ddy * decals.data[decal_index].orm_rect.zw).xyz;
+						ao = mix(ao, decal_orm.r, decal_albedo.a);
+						roughness = mix(roughness, decal_orm.g, decal_albedo.a);
+						metallic = mix(metallic, decal_orm.b, decal_albedo.a);
+					}
 				}
-			}
 
-			if (decals.data[decal_index].emission_rect != vec4(0.0)) {
-				//emission is additive, so its independent from albedo
-				emission += textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].emission_rect.zw + decals.data[decal_index].emission_rect.xy, ddx * decals.data[decal_index].emission_rect.zw, ddy * decals.data[decal_index].emission_rect.zw).xyz * decals.data[decal_index].emission_energy * fade;
+				if (decals.data[decal_index].emission_rect != vec4(0.0)) {
+					//emission is additive, so its independent from albedo
+					emission += textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].emission_rect.zw + decals.data[decal_index].emission_rect.xy, ddx * decals.data[decal_index].emission_rect.zw, ddy * decals.data[decal_index].emission_rect.zw).xyz * decals.data[decal_index].emission_energy * fade;
+				}
 			}
 		}
 	}
@@ -2348,12 +2415,45 @@ FRAGMENT_SHADER_CODE
 		vec4 reflection_accum = vec4(0.0, 0.0, 0.0, 0.0);
 		vec4 ambient_accum = vec4(0.0, 0.0, 0.0, 0.0);
 
-		uint reflection_probe_count = cluster_cell.z >> CLUSTER_COUNTER_SHIFT;
-		uint reflection_probe_pointer = cluster_cell.z & CLUSTER_POINTER_MASK;
+		uint cluster_reflection_offset = cluster_offset + scene_data.cluster_type_size * 3;
+
+		uint item_min;
+		uint item_max;
+		uint item_from;
+		uint item_to;
+
+		cluster_get_item_range(cluster_reflection_offset + scene_data.max_cluster_element_count_div_32 + cluster_z, item_min, item_max, item_from, item_to);
+
+#ifdef USE_SUBGROUPS
+		item_from = subgroupBroadcastFirst(subgroupMin(item_from));
+		item_to = subgroupBroadcastFirst(subgroupMax(item_to));
+#endif
+
+		for (uint i = item_from; i < item_to; i++) {
+			uint mask = cluster_buffer.data[cluster_reflection_offset + i];
+			mask &= cluster_get_range_clip_mask(i, item_min, item_max);
+#ifdef USE_SUBGROUPS
+			uint merged_mask = subgroupBroadcastFirst(subgroupOr(mask));
+#else
+			uint merged_mask = mask;
+#endif
 
-		for (uint i = 0; i < reflection_probe_count; i++) {
-			uint ref_index = cluster_data.indices[reflection_probe_pointer + i];
-			reflection_process(ref_index, vertex, normal, roughness, ambient_light, specular_light, ambient_accum, reflection_accum);
+			while (merged_mask != 0) {
+				uint bit = findMSB(merged_mask);
+				merged_mask &= ~(1 << bit);
+#ifdef USE_SUBGROUPS
+				if (((1 << bit) & mask) == 0) { //do not process if not originally here
+					continue;
+				}
+#endif
+				uint reflection_index = 32 * i + bit;
+
+				if (!bool(reflections.data[reflection_index].mask & draw_call.layer_mask)) {
+					continue; //not masked
+				}
+
+				reflection_process(reflection_index, vertex, normal, roughness, ambient_light, specular_light, ambient_accum, reflection_accum);
+			}
 		}
 
 		if (reflection_accum.a > 0.0) {
@@ -2800,7 +2900,9 @@ FRAGMENT_SHADER_CODE
 					shadow = float(shadow1 >> ((i - 4) * 8) & 0xFF) / 255.0;
 				}
 
-				light_compute(normal, directional_lights.data[i].direction, normalize(view), directional_lights.data[i].color * directional_lights.data[i].energy, shadow, f0, orms,
+				shadow = blur_shadow(shadow); // assign the result: blur_shadow() takes shadow by value, so a bare call is discarded
+
+				light_compute(normal, directional_lights.data[i].direction, normalize(view), directional_lights.data[i].color * directional_lights.data[i].energy, shadow, f0, orms, 1.0,
 #ifdef LIGHT_BACKLIGHT_USED
 						backlight,
 #endif
@@ -2833,154 +2935,146 @@ FRAGMENT_SHADER_CODE
 
 		{ //omni lights
 
-			uint omni_light_count = cluster_cell.x >> CLUSTER_COUNTER_SHIFT;
-			uint omni_light_pointer = cluster_cell.x & CLUSTER_POINTER_MASK;
+			uint cluster_omni_offset = cluster_offset;
 
-			// Do shadow and lighting in two passes to reduce register pressure
-			uint shadow0 = 0;
-			uint shadow1 = 0;
-			uint shadow2 = 0;
+			uint item_min;
+			uint item_max;
+			uint item_from;
+			uint item_to;
 
-			for (uint i = 0; i < 18; i++) {
-				if (i >= omni_light_count) {
-					break;
-				}
-				uint light_index = cluster_data.indices[omni_light_pointer + i];
+			cluster_get_item_range(cluster_omni_offset + scene_data.max_cluster_element_count_div_32 + cluster_z, item_min, item_max, item_from, item_to);
 
-				if (!bool(lights.data[light_index].mask & draw_call.layer_mask)) {
-					continue; //not masked
-				}
+#ifdef USE_SUBGROUPS
+			item_from = subgroupBroadcastFirst(subgroupMin(item_from));
+			item_to = subgroupBroadcastFirst(subgroupMax(item_to));
+#endif
 
-				float s = light_process_omni_shadow(light_index, vertex, view);
-				if (i < 6) {
-					shadow0 |= uint(clamp(s * 31.0, 0.0, 31.0)) << (i * 5);
-				} else if (i < 12) {
-					shadow1 |= uint(clamp(s * 31.0, 0.0, 31.0)) << ((i - 6) * 5);
-				} else {
-					shadow2 |= uint(clamp(s * 31.0, 0.0, 31.0)) << ((i - 12) * 5);
-				}
-			}
+			for (uint i = item_from; i < item_to; i++) {
+				uint mask = cluster_buffer.data[cluster_omni_offset + i];
+				mask &= cluster_get_range_clip_mask(i, item_min, item_max);
+#ifdef USE_SUBGROUPS
+				uint merged_mask = subgroupBroadcastFirst(subgroupOr(mask));
+#else
+			uint merged_mask = mask;
+#endif
 
-			for (uint i = 0; i < 18; i++) {
-				if (i == omni_light_count) {
-					break;
-				}
-				uint light_index = cluster_data.indices[omni_light_pointer + i];
+				while (merged_mask != 0) {
+					uint bit = findMSB(merged_mask);
+					merged_mask &= ~(1 << bit);
+#ifdef USE_SUBGROUPS
+					if (((1 << bit) & mask) == 0) { //do not process if not originally here
+						continue;
+					}
+#endif
+					uint light_index = 32 * i + bit;
 
-				if (!bool(lights.data[light_index].mask & draw_call.layer_mask)) {
-					continue; //not masked
-				}
+					if (!bool(omni_lights.data[light_index].mask & draw_call.layer_mask)) {
+						continue; //not masked
+					}
 
-				float shadow;
-				if (i < 6) {
-					shadow = float(shadow0 >> (i * 5) & 0x1F) / 31.0;
-				} else if (i < 12) {
-					shadow = float(shadow1 >> ((i - 6) * 5) & 0x1F) / 31.0;
-				} else {
-					shadow = float(shadow1 >> ((i - 12) * 5) & 0x1F) / 31.0;
-				}
+					float shadow = light_process_omni_shadow(light_index, vertex, view);
+
+					shadow = blur_shadow(shadow);
 
-				light_process_omni(light_index, vertex, view, normal, vertex_ddx, vertex_ddy, f0, orms, shadow,
+					light_process_omni(light_index, vertex, view, normal, vertex_ddx, vertex_ddy, f0, orms, shadow,
 #ifdef LIGHT_BACKLIGHT_USED
-						backlight,
+							backlight,
 #endif
 #ifdef LIGHT_TRANSMITTANCE_USED
-						transmittance_color,
-						transmittance_depth,
-						transmittance_curve,
-						transmittance_boost,
+							transmittance_color,
+							transmittance_depth,
+							transmittance_curve,
+							transmittance_boost,
 #endif
 #ifdef LIGHT_RIM_USED
-						rim,
-						rim_tint,
-						albedo,
+							rim,
+							rim_tint,
+							albedo,
 #endif
 #ifdef LIGHT_CLEARCOAT_USED
-						clearcoat, clearcoat_gloss,
+							clearcoat, clearcoat_gloss,
 #endif
 #ifdef LIGHT_ANISOTROPY_USED
-						tangent, binormal, anisotropy,
+							tangent, binormal, anisotropy,
 #endif
 #ifdef USE_SHADOW_TO_OPACITY
-						alpha,
+							alpha,
 #endif
-						diffuse_light, specular_light);
+							diffuse_light, specular_light);
+				}
 			}
 		}
 
 		{ //spot lights
-			uint spot_light_count = cluster_cell.y >> CLUSTER_COUNTER_SHIFT;
-			uint spot_light_pointer = cluster_cell.y & CLUSTER_POINTER_MASK;
 
-			// Do shadow and lighting in two passes to reduce register pressure
-			uint shadow0 = 0;
-			uint shadow1 = 0;
-			uint shadow2 = 0;
+			uint cluster_spot_offset = cluster_offset + scene_data.cluster_type_size;
 
-			for (uint i = 0; i < 18; i++) {
-				if (i >= spot_light_count) {
-					break;
-				}
-				uint light_index = cluster_data.indices[spot_light_pointer + i];
+			uint item_min;
+			uint item_max;
+			uint item_from;
+			uint item_to;
 
-				if (!bool(lights.data[light_index].mask & draw_call.layer_mask)) {
-					continue; //not masked
-				}
+			cluster_get_item_range(cluster_spot_offset + scene_data.max_cluster_element_count_div_32 + cluster_z, item_min, item_max, item_from, item_to);
 
-				float s = light_process_spot_shadow(light_index, vertex, view);
-				if (i < 6) {
-					shadow0 |= uint(clamp(s * 31.0, 0.0, 31.0)) << (i * 5);
-				} else if (i < 12) {
-					shadow1 |= uint(clamp(s * 31.0, 0.0, 31.0)) << ((i - 6) * 5);
-				} else {
-					shadow2 |= uint(clamp(s * 31.0, 0.0, 31.0)) << ((i - 12) * 5);
-				}
-			}
+#ifdef USE_SUBGROUPS
+			item_from = subgroupBroadcastFirst(subgroupMin(item_from));
+			item_to = subgroupBroadcastFirst(subgroupMax(item_to));
+#endif
 
-			for (uint i = 0; i < 18; i++) {
-				if (i == spot_light_count) {
-					break;
-				}
-				uint light_index = cluster_data.indices[spot_light_pointer + i];
+			for (uint i = item_from; i < item_to; i++) {
+				uint mask = cluster_buffer.data[cluster_spot_offset + i];
+				mask &= cluster_get_range_clip_mask(i, item_min, item_max);
+#ifdef USE_SUBGROUPS
+				uint merged_mask = subgroupBroadcastFirst(subgroupOr(mask));
+#else
+			uint merged_mask = mask;
+#endif
 
-				if (!bool(lights.data[light_index].mask & draw_call.layer_mask)) {
-					continue; //not masked
-				}
+				while (merged_mask != 0) {
+					uint bit = findMSB(merged_mask);
+					merged_mask &= ~(1 << bit);
+#ifdef USE_SUBGROUPS
+					if (((1 << bit) & mask) == 0) { //do not process if not originally here
+						continue;
+					}
+#endif
 
-				float shadow;
-				if (i < 6) {
-					shadow = float(shadow0 >> (i * 5) & 0x1F) / 31.0;
-				} else if (i < 12) {
-					shadow = float(shadow1 >> ((i - 6) * 5) & 0x1F) / 31.0;
-				} else {
-					shadow = float(shadow1 >> ((i - 12) * 5) & 0x1F) / 31.0;
-				}
+					uint light_index = 32 * i + bit;
 
-				light_process_spot(light_index, vertex, view, normal, vertex_ddx, vertex_ddy, f0, orms, shadow,
+					if (!bool(spot_lights.data[light_index].mask & draw_call.layer_mask)) {
+						continue; //not masked
+					}
+
+					float shadow = light_process_spot_shadow(light_index, vertex, view);
+
+					shadow = blur_shadow(shadow);
+
+					light_process_spot(light_index, vertex, view, normal, vertex_ddx, vertex_ddy, f0, orms, shadow,
 #ifdef LIGHT_BACKLIGHT_USED
-						backlight,
+							backlight,
 #endif
 #ifdef LIGHT_TRANSMITTANCE_USED
-						transmittance_color,
-						transmittance_depth,
-						transmittance_curve,
-						transmittance_boost,
+							transmittance_color,
+							transmittance_depth,
+							transmittance_curve,
+							transmittance_boost,
 #endif
 #ifdef LIGHT_RIM_USED
-						rim,
-						rim_tint,
-						albedo,
+							rim,
+							rim_tint,
+							albedo,
 #endif
 #ifdef LIGHT_CLEARCOAT_USED
-						clearcoat, clearcoat_gloss,
+							clearcoat, clearcoat_gloss,
 #endif
 #ifdef LIGHT_ANISOTROPY_USED
-						tangent, binormal, anisotropy,
+							tangent, binormal, anisotropy,
 #endif
 #ifdef USE_SHADOW_TO_OPACITY
-						alpha,
+							alpha,
 #endif
-						diffuse_light, specular_light);
+							diffuse_light, specular_light);
+				}
 			}
 		}
 
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_inc.glsl
index 87ce74ba88..a37e32e1fc 100644
--- a/servers/rendering/renderer_rd/shaders/scene_forward_inc.glsl
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_inc.glsl
@@ -3,6 +3,15 @@
 
 #define MAX_GI_PROBES 8
 
+#if defined(GL_KHR_shader_subgroup_ballot) && defined(GL_KHR_shader_subgroup_arithmetic)
+// Both subgroup extensions are advertised: enable them and define USE_SUBGROUPS, which shader code tests with #ifdef.
+#extension GL_KHR_shader_subgroup_ballot : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+#define USE_SUBGROUPS
+
+#endif
+
 #include "cluster_data_inc.glsl"
 
 #if !defined(MODE_RENDER_DEPTH) || defined(MODE_RENDER_MATERIAL) || defined(MODE_RENDER_SDF) || defined(MODE_RENDER_NORMAL_ROUGHNESS) || defined(MODE_RENDER_GIPROBE) || defined(TANGENT_USED) || defined(NORMAL_MAP_USED)
@@ -52,6 +61,11 @@ layout(set = 0, binding = 3, std140) uniform SceneData {
 	vec2 viewport_size;
 	vec2 screen_pixel_size;
 
+	uint cluster_shift;
+	uint cluster_width;
+	uint cluster_type_size;
+	uint max_cluster_element_count_div_32;
+
 	//use vec4s because std140 doesnt play nice with vec2s, z and w are wasted
 	vec4 directional_penumbra_shadow_kernel[32];
 	vec4 directional_soft_shadow_kernel[32];
@@ -139,17 +153,22 @@ scene_data;
 #define INSTANCE_FLAGS_SKELETON (1 << 19)
 #define INSTANCE_FLAGS_NON_UNIFORM_SCALE (1 << 20)
 
-layout(set = 0, binding = 5, std430) restrict readonly buffer Lights {
+layout(set = 0, binding = 5, std430) restrict readonly buffer OmniLights {
+	LightData data[];
+}
+omni_lights;
+
+layout(set = 0, binding = 6, std430) restrict readonly buffer SpotLights {
 	LightData data[];
 }
-lights;
+spot_lights;
 
-layout(set = 0, binding = 6) buffer restrict readonly ReflectionProbeData {
+layout(set = 0, binding = 7) buffer restrict readonly ReflectionProbeData {
 	ReflectionData data[];
 }
 reflections;
 
-layout(set = 0, binding = 7, std140) uniform DirectionalLights {
+layout(set = 0, binding = 8, std140) uniform DirectionalLights {
 	DirectionalLightData data[MAX_DIRECTIONAL_LIGHT_DATA_STRUCTS];
 }
 directional_lights;
@@ -183,16 +202,9 @@ layout(set = 0, binding = 14, std430) restrict readonly buffer Decals {
 }
 decals;
 
-layout(set = 0, binding = 15) uniform utexture3D cluster_texture;
-
-layout(set = 0, binding = 16, std430) restrict readonly buffer ClusterData {
-	uint indices[];
-}
-cluster_data;
+layout(set = 0, binding = 15) uniform texture2D directional_shadow_atlas;
 
-layout(set = 0, binding = 17) uniform texture2D directional_shadow_atlas;
-
-layout(set = 0, binding = 18, std430) restrict readonly buffer GlobalVariableData {
+layout(set = 0, binding = 16, std430) restrict readonly buffer GlobalVariableData {
 	vec4 data[];
 }
 global_variables;
@@ -206,7 +218,7 @@ struct SDFGIProbeCascadeData {
 	float to_cell; // 1/bounds * grid_size
 };
 
-layout(set = 0, binding = 19, std140) uniform SDFGI {
+layout(set = 0, binding = 17, std140) uniform SDFGI {
 	vec3 grid_size;
 	uint max_cascades;
 
@@ -262,14 +274,19 @@ layout(set = 1, binding = 3) uniform texture2DArray lightmap_textures[MAX_LIGHTM
 layout(set = 1, binding = 4) uniform texture3D gi_probe_textures[MAX_GI_PROBES];
 #endif
 
+layout(set = 1, binding = 5, std430) buffer restrict readonly ClusterBuffer {
+	uint data[]; // read-only 32-bit words; shader code indexes them with offsets derived from the cluster position
+}
+cluster_buffer;
+
 /* Set 3, Render Buffers */
 
 #ifdef MODE_RENDER_SDF
 
-layout(r16ui, set = 1, binding = 5) uniform restrict writeonly uimage3D albedo_volume_grid;
-layout(r32ui, set = 1, binding = 6) uniform restrict writeonly uimage3D emission_grid;
-layout(r32ui, set = 1, binding = 7) uniform restrict writeonly uimage3D emission_aniso_grid;
-layout(r32ui, set = 1, binding = 8) uniform restrict uimage3D geom_facing_grid;
+layout(r16ui, set = 1, binding = 6) uniform restrict writeonly uimage3D albedo_volume_grid;
+layout(r32ui, set = 1, binding = 7) uniform restrict writeonly uimage3D emission_grid;
+layout(r32ui, set = 1, binding = 8) uniform restrict writeonly uimage3D emission_aniso_grid;
+layout(r32ui, set = 1, binding = 9) uniform restrict uimage3D geom_facing_grid;
 
 //still need to be present for shaders that use it, so remap them to something
 #define depth_buffer shadow_atlas
@@ -278,17 +295,17 @@ layout(r32ui, set = 1, binding = 8) uniform restrict uimage3D geom_facing_grid;
 
 #else
 
-layout(set = 1, binding = 5) uniform texture2D depth_buffer;
-layout(set = 1, binding = 6) uniform texture2D color_buffer;
+layout(set = 1, binding = 6) uniform texture2D depth_buffer;
+layout(set = 1, binding = 7) uniform texture2D color_buffer;
 
 #ifndef LOW_END_MODE
 
-layout(set = 1, binding = 7) uniform texture2D normal_roughness_buffer;
-layout(set = 1, binding = 8) uniform texture2D ao_buffer;
-layout(set = 1, binding = 9) uniform texture2D ambient_buffer;
-layout(set = 1, binding = 10) uniform texture2D reflection_buffer;
-layout(set = 1, binding = 11) uniform texture2DArray sdfgi_lightprobe_texture;
-layout(set = 1, binding = 12) uniform texture3D sdfgi_occlusion_cascades;
+layout(set = 1, binding = 8) uniform texture2D normal_roughness_buffer;
+layout(set = 1, binding = 9) uniform texture2D ao_buffer;
+layout(set = 1, binding = 10) uniform texture2D ambient_buffer;
+layout(set = 1, binding = 11) uniform texture2D reflection_buffer;
+layout(set = 1, binding = 12) uniform texture2DArray sdfgi_lightprobe_texture;
+layout(set = 1, binding = 13) uniform texture3D sdfgi_occlusion_cascades;
 
 struct GIProbeData {
 	mat4 xform;
@@ -306,12 +323,12 @@ struct GIProbeData {
 	uint mipmaps;
 };
 
-layout(set = 1, binding = 13, std140) uniform GIProbes {
+layout(set = 1, binding = 14, std140) uniform GIProbes {
 	GIProbeData data[MAX_GI_PROBES];
 }
 gi_probes;
 
-layout(set = 1, binding = 14) uniform texture3D volumetric_fog_texture;
+layout(set = 1, binding = 15) uniform texture3D volumetric_fog_texture;
 
 #endif // LOW_END_MODE
 
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl
index 30dbf5871f..ed0a8a4b86 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl
@@ -143,10 +143,78 @@ void main() {
 	uint voxel_albedo = process_voxels.data[voxel_index].albedo;
 
 	vec3 albedo = vec3(uvec3(voxel_albedo >> 10, voxel_albedo >> 5, voxel_albedo) & uvec3(0x1F)) / float(0x1F);
-	vec3 light_accum[6];
-
+	vec3 light_accum[6] = vec3[](vec3(0.0), vec3(0.0), vec3(0.0), vec3(0.0), vec3(0.0), vec3(0.0));
 	uint valid_aniso = (voxel_albedo >> 15) & 0x3F;
 
+	const vec3 aniso_dir[6] = vec3[](
+			vec3(1, 0, 0),
+			vec3(0, 1, 0),
+			vec3(0, 0, 1),
+			vec3(-1, 0, 0),
+			vec3(0, -1, 0),
+			vec3(0, 0, -1));
+
+	// Add indirect light first, in order to save computation resources
+#ifdef MODE_PROCESS_DYNAMIC
+	if (params.multibounce) {
+		vec3 pos = (vec3(positioni) + vec3(0.5)) * float(params.probe_axis_size - 1) / params.grid_size;
+		ivec3 probe_base_pos = ivec3(pos);
+
+		float weight_accum[6] = float[](0, 0, 0, 0, 0, 0);
+
+		ivec3 tex_pos = ivec3(probe_base_pos.xy, int(params.cascade));
+		tex_pos.x += probe_base_pos.z * int(params.probe_axis_size);
+
+		tex_pos.xy = tex_pos.xy * (OCT_SIZE + 2) + ivec2(1);
+
+		vec3 base_tex_posf = vec3(tex_pos);
+		vec2 tex_pixel_size = 1.0 / vec2(ivec2((OCT_SIZE + 2) * params.probe_axis_size * params.probe_axis_size, (OCT_SIZE + 2) * params.probe_axis_size));
+		vec3 probe_uv_offset = (ivec3(OCT_SIZE + 2, OCT_SIZE + 2, (OCT_SIZE + 2) * params.probe_axis_size)) * tex_pixel_size.xyx;
+
+		for (uint j = 0; j < 8; j++) {
+			ivec3 offset = (ivec3(j) >> ivec3(0, 1, 2)) & ivec3(1, 1, 1);
+			ivec3 probe_posi = probe_base_pos;
+			probe_posi += offset;
+
+			// Compute weight
+
+			vec3 probe_pos = vec3(probe_posi);
+			vec3 probe_to_pos = pos - probe_pos;
+			vec3 probe_dir = normalize(-probe_to_pos);
+
+			// Compute lightprobe texture position
+
+			vec3 trilinear = vec3(1.0) - abs(probe_to_pos);
+
+			for (uint k = 0; k < 6; k++) {
+				if (bool(valid_aniso & (1 << k))) {
+					vec3 n = aniso_dir[k];
+					float weight = trilinear.x * trilinear.y * trilinear.z * max(0.005, dot(n, probe_dir));
+
+					vec3 tex_posf = base_tex_posf + vec3(octahedron_encode(n) * float(OCT_SIZE), 0.0);
+					tex_posf.xy *= tex_pixel_size;
+
+					vec3 pos_uvw = tex_posf;
+					pos_uvw.xy += vec2(offset.xy) * probe_uv_offset.xy;
+					pos_uvw.x += float(offset.z) * probe_uv_offset.z;
+					vec3 indirect_light = textureLod(sampler2DArray(lightprobe_texture, linear_sampler), pos_uvw, 0.0).rgb;
+
+					light_accum[k] += indirect_light * weight;
+					weight_accum[k] += weight;
+				}
+			}
+		}
+
+		for (uint k = 0; k < 6; k++) {
+			if (weight_accum[k] > 0.0) {
+				light_accum[k] /= weight_accum[k];
+				light_accum[k] *= albedo;
+			}
+		}
+	}
+
+#endif
+
 	{
 		uint rgbe = process_voxels.data[voxel_index].light;
 
@@ -162,18 +230,10 @@ void main() {
 		uint aniso = process_voxels.data[voxel_index].light_aniso;
 		for (uint i = 0; i < 6; i++) {
 			float strength = ((aniso >> (i * 5)) & 0x1F) / float(0x1F);
-			light_accum[i] = l * strength;
+			light_accum[i] += l * strength;
 		}
 	}
 
-	const vec3 aniso_dir[6] = vec3[](
-			vec3(1, 0, 0),
-			vec3(0, 1, 0),
-			vec3(0, 0, 1),
-			vec3(-1, 0, 0),
-			vec3(0, -1, 0),
-			vec3(0, 0, -1));
-
 	// Raytrace light
 
 	vec3 pos_to_uvw = 1.0 / params.grid_size;
@@ -292,65 +352,6 @@ void main() {
 		}
 	}
 
-	// Add indirect light
-
-	if (params.multibounce) {
-		vec3 pos = (vec3(positioni) + vec3(0.5)) * float(params.probe_axis_size - 1) / params.grid_size;
-		ivec3 probe_base_pos = ivec3(pos);
-
-		vec4 probe_accum[6] = vec4[](vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));
-		float weight_accum[6] = float[](0, 0, 0, 0, 0, 0);
-
-		ivec3 tex_pos = ivec3(probe_base_pos.xy, int(params.cascade));
-		tex_pos.x += probe_base_pos.z * int(params.probe_axis_size);
-
-		tex_pos.xy = tex_pos.xy * (OCT_SIZE + 2) + ivec2(1);
-
-		vec3 base_tex_posf = vec3(tex_pos);
-		vec2 tex_pixel_size = 1.0 / vec2(ivec2((OCT_SIZE + 2) * params.probe_axis_size * params.probe_axis_size, (OCT_SIZE + 2) * params.probe_axis_size));
-		vec3 probe_uv_offset = (ivec3(OCT_SIZE + 2, OCT_SIZE + 2, (OCT_SIZE + 2) * params.probe_axis_size)) * tex_pixel_size.xyx;
-
-		for (uint j = 0; j < 8; j++) {
-			ivec3 offset = (ivec3(j) >> ivec3(0, 1, 2)) & ivec3(1, 1, 1);
-			ivec3 probe_posi = probe_base_pos;
-			probe_posi += offset;
-
-			// Compute weight
-
-			vec3 probe_pos = vec3(probe_posi);
-			vec3 probe_to_pos = pos - probe_pos;
-			vec3 probe_dir = normalize(-probe_to_pos);
-
-			// Compute lightprobe texture position
-
-			vec3 trilinear = vec3(1.0) - abs(probe_to_pos);
-
-			for (uint k = 0; k < 6; k++) {
-				if (bool(valid_aniso & (1 << k))) {
-					vec3 n = aniso_dir[k];
-					float weight = trilinear.x * trilinear.y * trilinear.z * max(0.005, dot(n, probe_dir));
-
-					vec3 tex_posf = base_tex_posf + vec3(octahedron_encode(n) * float(OCT_SIZE), 0.0);
-					tex_posf.xy *= tex_pixel_size;
-
-					vec3 pos_uvw = tex_posf;
-					pos_uvw.xy += vec2(offset.xy) * probe_uv_offset.xy;
-					pos_uvw.x += float(offset.z) * probe_uv_offset.z;
-					vec4 indirect_light = textureLod(sampler2DArray(lightprobe_texture, linear_sampler), pos_uvw, 0.0);
-
-					probe_accum[k] += indirect_light * weight;
-					weight_accum[k] += weight;
-				}
-			}
-		}
-
-		for (uint k = 0; k < 6; k++) {
-			if (weight_accum[k] > 0.0) {
-				light_accum[k] += probe_accum[k].rgb * albedo / weight_accum[k];
-			}
-		}
-	}
-
 	// Store the light in the light texture
 
 	float lumas[6];
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl
index d516ab22c3..67630a3aa1 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl
@@ -136,12 +136,24 @@ uint rgbe_encode(vec3 color) {
 	return (uint(sRed) & 0x1FF) | ((uint(sGreen) & 0x1FF) << 9) | ((uint(sBlue) & 0x1FF) << 18) | ((uint(exps) & 0x1F) << 27);
 }
 
+struct SH { // Flat float storage for one set of SH coefficients: c[i * 3 + 0..2] holds r, g, b of coefficient i.
+#if (SH_SIZE == 16)
+	float c[48]; // 16 coefficients * 3 channels
+#else
+	float c[28]; // NOTE(review): the visible code only touches c[0 .. SH_SIZE * 3 - 1]; presumably SH_SIZE is 9 here (27 floats), leaving one spare - confirm
+#endif
+};
+
+shared SH sh_accum[64]; // 8x8: one accumulator per invocation, indexed by probe_index (local x + local y * 8) in main()
+
 void main() {
 	ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
 	if (any(greaterThanEqual(pos, params.image_size))) { //too large, do nothing
 		return;
 	}
 
+	uint probe_index = gl_LocalInvocationID.x + gl_LocalInvocationID.y * 8;
+
 #ifdef MODE_PROCESS
 
 	float probe_cell_size = float(params.grid_size.x / float(params.probe_axis_size - 1)) / cascades.data[params.cascade].to_cell;
@@ -154,27 +166,9 @@ void main() {
 	vec3 probe_pos = cascades.data[params.cascade].offset + vec3(probe_cell) * probe_cell_size;
 	vec3 pos_to_uvw = 1.0 / params.grid_size;
 
-	vec4 probe_sh_accum[SH_SIZE] = vec4[](
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0)
-#if (SH_SIZE == 16)
-					,
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0),
-			vec4(0.0)
-#endif
-	);
+	for (uint i = 0; i < SH_SIZE * 3; i++) {
+		sh_accum[probe_index].c[i] = 0.0;
+	}
 
 	// quickly ensure each probe has a different "offset" for the vogel function, based on integer world position
 	uvec3 h3 = hash3(uvec3(params.world_offset + probe_cell));
@@ -278,33 +272,33 @@ void main() {
 		}
 
 		vec3 ray_dir2 = ray_dir * ray_dir;
-		float c[SH_SIZE] = float[](
-
-				0.282095, //l0
-				0.488603 * ray_dir.y, //l1n1
-				0.488603 * ray_dir.z, //l1n0
-				0.488603 * ray_dir.x, //l1p1
-				1.092548 * ray_dir.x * ray_dir.y, //l2n2
-				1.092548 * ray_dir.y * ray_dir.z, //l2n1
-				0.315392 * (3.0 * ray_dir2.z - 1.0), //l20
-				1.092548 * ray_dir.x * ray_dir.z, //l2p1
-				0.546274 * (ray_dir2.x - ray_dir2.y) //l2p2
+
+#define SH_ACCUM(m_idx, m_value)                       \
+	{                                                  \
+		vec3 l = light.rgb * (m_value);                \
+		sh_accum[probe_index].c[m_idx * 3 + 0] += l.r; \
+		sh_accum[probe_index].c[m_idx * 3 + 1] += l.g; \
+		sh_accum[probe_index].c[m_idx * 3 + 2] += l.b; \
+	}
+		SH_ACCUM(0, 0.282095); //l0
+		SH_ACCUM(1, 0.488603 * ray_dir.y); //l1n1
+		SH_ACCUM(2, 0.488603 * ray_dir.z); //l1n0
+		SH_ACCUM(3, 0.488603 * ray_dir.x); //l1p1
+		SH_ACCUM(4, 1.092548 * ray_dir.x * ray_dir.y); //l2n2
+		SH_ACCUM(5, 1.092548 * ray_dir.y * ray_dir.z); //l2n1
+		SH_ACCUM(6, 0.315392 * (3.0 * ray_dir2.z - 1.0)); //l20
+		SH_ACCUM(7, 1.092548 * ray_dir.x * ray_dir.z); //l2p1
+		SH_ACCUM(8, 0.546274 * (ray_dir2.x - ray_dir2.y)); //l2p2
 #if (SH_SIZE == 16)
-				,
-				0.590043 * ray_dir.y * (3.0f * ray_dir2.x - ray_dir2.y),
-				2.890611 * ray_dir.y * ray_dir.x * ray_dir.z,
-				0.646360 * ray_dir.y * (-1.0f + 5.0f * ray_dir2.z),
-				0.373176 * (5.0f * ray_dir2.z * ray_dir.z - 3.0f * ray_dir.z),
-				0.457045 * ray_dir.x * (-1.0f + 5.0f * ray_dir2.z),
-				1.445305 * (ray_dir2.x - ray_dir2.y) * ray_dir.z,
-				0.590043 * ray_dir.x * (ray_dir2.x - 3.0f * ray_dir2.y)
+		SH_ACCUM(9, 0.590043 * ray_dir.y * (3.0f * ray_dir2.x - ray_dir2.y));
+		SH_ACCUM(10, 2.890611 * ray_dir.y * ray_dir.x * ray_dir.z);
+		SH_ACCUM(11, 0.646360 * ray_dir.y * (-1.0f + 5.0f * ray_dir2.z));
+		SH_ACCUM(12, 0.373176 * (5.0f * ray_dir2.z * ray_dir.z - 3.0f * ray_dir.z));
+		SH_ACCUM(13, 0.457045 * ray_dir.x * (-1.0f + 5.0f * ray_dir2.z));
+		SH_ACCUM(14, 1.445305 * (ray_dir2.x - ray_dir2.y) * ray_dir.z);
+		SH_ACCUM(15, 0.590043 * ray_dir.x * (ray_dir2.x - 3.0f * ray_dir2.y));
 
 #endif
-		);
-
-		for (uint j = 0; j < SH_SIZE; j++) {
-			probe_sh_accum[j] += light * c[j];
-		}
 	}
 
 	for (uint i = 0; i < SH_SIZE; i++) {
@@ -312,7 +306,7 @@ void main() {
 		ivec3 prev_pos = ivec3(pos.x, pos.y * SH_SIZE + i, int(params.history_index));
 		ivec2 average_pos = prev_pos.xy;
 
-		vec4 value = probe_sh_accum[i] * 4.0 / float(params.ray_count);
+		vec4 value = vec4(sh_accum[probe_index].c[i * 3 + 0], sh_accum[probe_index].c[i * 3 + 1], sh_accum[probe_index].c[i * 3 + 2], 1.0) * 4.0 / float(params.ray_count);
 
 		ivec4 ivalue = clamp(ivec4(value * float(1 << HISTORY_BITS)), -32768, 32767); //clamp to 16 bits, so higher values don't break average
 
@@ -344,37 +338,11 @@ void main() {
 	ivec2 oct_pos = (pos / OCT_SIZE) * (OCT_SIZE + 2) + ivec2(1);
 	ivec2 local_pos = pos % OCT_SIZE;
 
-	//fill the spherical harmonic
-	vec4 sh[SH_SIZE];
-
-	for (uint i = 0; i < SH_SIZE; i++) {
-		// store in history texture
-		ivec2 average_pos = sh_pos + ivec2(0, i);
-		ivec4 average = imageLoad(lightprobe_average_texture, average_pos);
-
-		sh[i] = (vec4(average) / float(params.history_size)) / float(1 << HISTORY_BITS);
-	}
-
 	//compute the octahedral normal for this texel
 	vec3 normal = octahedron_encode(vec2(local_pos) / float(OCT_SIZE));
-	/*
+
 	// read the spherical harmonic
-	const float c1 = 0.429043;
-	const float c2 = 0.511664;
-	const float c3 = 0.743125;
-	const float c4 = 0.886227;
-	const float c5 = 0.247708;
-	vec4 light = (c1 * sh[8] * (normal.x * normal.x - normal.y * normal.y) +
-					  c3 * sh[6] * normal.z * normal.z +
-					  c4 * sh[0] -
-					  c5 * sh[6] +
-					  2.0 * c1 * sh[4] * normal.x * normal.y +
-					  2.0 * c1 * sh[7] * normal.x * normal.z +
-					  2.0 * c1 * sh[5] * normal.y * normal.z +
-					  2.0 * c2 * sh[3] * normal.x +
-					  2.0 * c2 * sh[1] * normal.y +
-					  2.0 * c2 * sh[2] * normal.z);
-*/
+
 	vec3 normal2 = normal * normal;
 	float c[SH_SIZE] = float[](
 
@@ -426,7 +394,14 @@ void main() {
 	vec3 radiance = vec3(0.0);
 
 	for (uint i = 0; i < SH_SIZE; i++) {
-		vec3 m = sh[i].rgb * c[i] * 4.0;
+		// store in history texture
+		ivec2 average_pos = sh_pos + ivec2(0, i);
+		ivec4 average = imageLoad(lightprobe_average_texture, average_pos);
+
+		vec4 sh = (vec4(average) / float(params.history_size)) / float(1 << HISTORY_BITS);
+
+		vec3 m = sh.rgb * c[i] * 4.0;
+
 		irradiance += m * l_mult[i];
 		radiance += m;
 	}
diff --git a/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl b/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl
index 6215e721ce..aa32809a06 100644
--- a/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl
+++ b/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl
@@ -4,6 +4,15 @@
 
 VERSION_DEFINES
 
+/* Do not use subgroups here: there seems to be little advantage and they cause glitches.
+#extension GL_KHR_shader_subgroup_ballot: enable
+#extension GL_KHR_shader_subgroup_arithmetic: enable
+
+#if defined(GL_KHR_shader_subgroup_ballot) && defined(GL_KHR_shader_subgroup_arithmetic)
+#define USE_SUBGROUPS
+#endif
+*/
+
 #if defined(MODE_FOG) || defined(MODE_FILTER)
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
@@ -23,22 +32,25 @@ layout(local_size_x = 4, local_size_y = 4, local_size_z = 4) in;
 layout(set = 0, binding = 1) uniform texture2D shadow_atlas;
 layout(set = 0, binding = 2) uniform texture2D directional_shadow_atlas;
 
-layout(set = 0, binding = 3, std430) restrict readonly buffer Lights {
+layout(set = 0, binding = 3, std430) restrict readonly buffer OmniLights { // Omni light records; main() indexes them with the light index decoded from the cluster bitmasks.
 	LightData data[];
 }
-lights;
+omni_lights;
 
-layout(set = 0, binding = 4, std140) uniform DirectionalLights {
+layout(set = 0, binding = 4, std430) restrict readonly buffer SpotLights { // Spot light records; same LightData element type as OmniLights.
+	LightData data[];
+}
+spot_lights;
+
+layout(set = 0, binding = 5, std140) uniform DirectionalLights {
 	DirectionalLightData data[MAX_DIRECTIONAL_LIGHT_DATA_STRUCTS];
 }
 directional_lights;
 
-layout(set = 0, binding = 5) uniform utexture3D cluster_texture;
-
-layout(set = 0, binding = 6, std430) restrict readonly buffer ClusterData {
-	uint indices[];
+layout(set = 0, binding = 6, std430) buffer restrict readonly ClusterBuffer {
+	uint data[]; // Packed cluster words: 32-bit item bitmasks read in main(), plus min/max range words unpacked by cluster_get_item_range().
 }
-cluster_data;
+cluster_buffer;
 
 layout(set = 0, binding = 7) uniform sampler linear_sampler;
 
@@ -132,7 +144,7 @@ layout(set = 1, binding = 2) uniform texture3D sdfgi_occlusion_texture;
 
 #endif //SDFGI
 
-layout(push_constant, binding = 0, std430) uniform Params {
+layout(set = 0, binding = 14, std140) uniform Params {
 	vec2 fog_frustum_size_begin;
 	vec2 fog_frustum_size_end;
 
@@ -150,7 +162,14 @@ layout(push_constant, binding = 0, std430) uniform Params {
 	float detail_spread;
 	float gi_inject;
 	uint max_gi_probes;
-	uint pad;
+	uint cluster_type_size;
+
+	vec2 screen_size;
+	uint cluster_shift;
+	uint cluster_width;
+
+	uvec3 cluster_pad;
+	uint max_cluster_element_count_div_32;
 
 	mat3x4 cam_rotation;
 }
@@ -178,6 +197,22 @@ float get_omni_attenuation(float distance, float inv_range, float decay) {
 	return nd * pow(max(distance, 0.0001), -decay);
 }
 
+void cluster_get_item_range(uint p_offset, out uint item_min, out uint item_max, out uint item_from, out uint item_to) {
+	// Unpack the range word at p_offset: low 16 bits -> item_min, high 16 bits -> item_max.
+	uint packed_range = cluster_buffer.data[p_offset];
+	item_min = packed_range & 0xFFFF;
+	item_max = packed_range >> 16;
+
+	item_from = item_min >> 5; // index of the 32-bit mask word that contains item_min
+	item_to = (item_max != 0) ? ((item_max - 1) >> 5) + 1 : 0; // item_max == 0 is how "no elements" is stored, so it must not take the (item_max - 1) path
+}
+
+uint cluster_get_range_clip_mask(uint i, uint z_min, uint z_max) {
+	int first_bit = clamp(int(z_min) - 32 * int(i), 0, 31); // where the range starts inside 32-bit word i
+	int bit_count = min(int(z_max) - int(z_min), 32 - first_bit); // keep the run of set bits inside this word
+	return bitfieldInsert(uint(0), uint(0xFFFFFFFF), first_bit, bit_count);
+}
+
 void main() {
 	vec3 fog_cell_size = 1.0 / vec3(params.fog_volume_size);
 
@@ -193,6 +228,12 @@ void main() {
 	//posf += mix(vec3(0.0),vec3(1.0),0.3) * hash3f(uvec3(pos)) * 2.0 - 1.0;
 
 	vec3 fog_unit_pos = posf * fog_cell_size + fog_cell_size * 0.5; //center of voxels
+
+	uvec2 screen_pos = uvec2(fog_unit_pos.xy * params.screen_size);
+	uvec2 cluster_pos = screen_pos >> params.cluster_shift;
+	uint cluster_offset = (params.cluster_width * cluster_pos.y + cluster_pos.x) * (params.max_cluster_element_count_div_32 + 32);
+	//positions in screen are too spread apart, no hopes for optimizing with subgroups
+
 	fog_unit_pos.z = pow(fog_unit_pos.z, params.detail_spread);
 
 	vec3 view_pos;
@@ -200,6 +241,8 @@ void main() {
 	view_pos.z = -params.fog_frustum_end * fog_unit_pos.z;
 	view_pos.y = -view_pos.y;
 
+	uint cluster_z = uint(clamp((abs(view_pos.z) / params.z_far) * 32.0, 0.0, 31.0));
+
 	vec3 total_light = params.light_color;
 
 	float total_density = params.base_density;
@@ -266,95 +309,160 @@ void main() {
 
 	//compute lights from cluster
 
-	vec3 cluster_pos;
-	cluster_pos.xy = fog_unit_pos.xy;
-	cluster_pos.z = clamp((abs(view_pos.z) - params.z_near) / (params.z_far - params.z_near), 0.0, 1.0);
+	{ //omni lights
+
+		uint cluster_omni_offset = cluster_offset;
+
+		uint item_min;
+		uint item_max;
+		uint item_from;
+		uint item_to;
+
+		cluster_get_item_range(cluster_omni_offset + params.max_cluster_element_count_div_32 + cluster_z, item_min, item_max, item_from, item_to);
+
+#ifdef USE_SUBGROUPS
+		item_from = subgroupBroadcastFirst(subgroupMin(item_from));
+		item_to = subgroupBroadcastFirst(subgroupMax(item_to));
+#endif
 
-	uvec4 cluster_cell = texture(usampler3D(cluster_texture, linear_sampler), cluster_pos);
+		for (uint i = item_from; i < item_to; i++) {
+			uint mask = cluster_buffer.data[cluster_omni_offset + i];
+			mask &= cluster_get_range_clip_mask(i, item_min, item_max);
+#ifdef USE_SUBGROUPS
+			uint merged_mask = subgroupBroadcastFirst(subgroupOr(mask));
+#else
+			uint merged_mask = mask;
+#endif
 
-	uint omni_light_count = cluster_cell.x >> CLUSTER_COUNTER_SHIFT;
-	uint omni_light_pointer = cluster_cell.x & CLUSTER_POINTER_MASK;
+			while (merged_mask != 0) {
+				uint bit = findMSB(merged_mask);
+				merged_mask &= ~(1 << bit);
+#ifdef USE_SUBGROUPS
+				if (((1 << bit) & mask) == 0) { //do not process if not originally here
+					continue;
+				}
+#endif
+				uint light_index = 32 * i + bit;
 
-	for (uint i = 0; i < omni_light_count; i++) {
-		uint light_index = cluster_data.indices[omni_light_pointer + i];
+				//if (!bool(omni_lights.data[light_index].mask & draw_call.layer_mask)) {
+				//	continue; //not masked
+				//}
 
-		vec3 light_pos = lights.data[i].position;
-		float d = distance(lights.data[i].position, view_pos);
-		float shadow_attenuation = 1.0;
+				vec3 light_pos = omni_lights.data[light_index].position;
+				float d = distance(omni_lights.data[light_index].position, view_pos);
+				float shadow_attenuation = 1.0;
 
-		if (d * lights.data[i].inv_radius < 1.0) {
-			float attenuation = get_omni_attenuation(d, lights.data[i].inv_radius, lights.data[i].attenuation);
+				if (d * omni_lights.data[light_index].inv_radius < 1.0) {
+					float attenuation = get_omni_attenuation(d, omni_lights.data[light_index].inv_radius, omni_lights.data[light_index].attenuation);
 
-			vec3 light = lights.data[i].color / M_PI;
+					vec3 light = omni_lights.data[light_index].color / M_PI;
 
-			if (lights.data[i].shadow_enabled) {
-				//has shadow
-				vec4 v = vec4(view_pos, 1.0);
+					if (omni_lights.data[light_index].shadow_enabled) {
+						//has shadow
+						vec4 v = vec4(view_pos, 1.0);
 
-				vec4 splane = (lights.data[i].shadow_matrix * v);
-				float shadow_len = length(splane.xyz); //need to remember shadow len from here
+						vec4 splane = (omni_lights.data[light_index].shadow_matrix * v);
+						float shadow_len = length(splane.xyz); //need to remember shadow len from here
 
-				splane.xyz = normalize(splane.xyz);
-				vec4 clamp_rect = lights.data[i].atlas_rect;
+						splane.xyz = normalize(splane.xyz);
+						vec4 clamp_rect = omni_lights.data[light_index].atlas_rect;
 
-				if (splane.z >= 0.0) {
-					splane.z += 1.0;
+						if (splane.z >= 0.0) {
+							splane.z += 1.0;
 
-					clamp_rect.y += clamp_rect.w;
+							clamp_rect.y += clamp_rect.w;
 
-				} else {
-					splane.z = 1.0 - splane.z;
-				}
+						} else {
+							splane.z = 1.0 - splane.z;
+						}
 
-				splane.xy /= splane.z;
+						splane.xy /= splane.z;
 
-				splane.xy = splane.xy * 0.5 + 0.5;
-				splane.z = shadow_len * lights.data[i].inv_radius;
-				splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
-				splane.w = 1.0; //needed? i think it should be 1 already
+						splane.xy = splane.xy * 0.5 + 0.5;
+						splane.z = shadow_len * omni_lights.data[light_index].inv_radius;
+						splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
+						splane.w = 1.0; //needed? i think it should be 1 already
 
-				float depth = texture(sampler2D(shadow_atlas, linear_sampler), splane.xy).r;
+						float depth = texture(sampler2D(shadow_atlas, linear_sampler), splane.xy).r;
 
-				shadow_attenuation = exp(min(0.0, (depth - splane.z)) / lights.data[i].inv_radius * lights.data[i].shadow_volumetric_fog_fade);
+						shadow_attenuation = exp(min(0.0, (depth - splane.z)) / omni_lights.data[light_index].inv_radius * omni_lights.data[light_index].shadow_volumetric_fog_fade);
+					}
+					total_light += light * attenuation * shadow_attenuation;
+				}
 			}
-			total_light += light * attenuation * shadow_attenuation;
 		}
 	}
 
-	uint spot_light_count = cluster_cell.y >> CLUSTER_COUNTER_SHIFT;
-	uint spot_light_pointer = cluster_cell.y & CLUSTER_POINTER_MASK;
+	{ // Spot lights: accumulate every spot light flagged for this cluster cell into total_light.
+
+		uint cluster_spot_offset = cluster_offset + params.cluster_type_size; // spot data sits cluster_type_size words past the omni data
+
+		uint item_min;
+		uint item_max;
+		uint item_from;
+		uint item_to;
+
+		cluster_get_item_range(cluster_spot_offset + params.max_cluster_element_count_div_32 + cluster_z, item_min, item_max, item_from, item_to);
+
+#ifdef USE_SUBGROUPS
+		item_from = subgroupBroadcastFirst(subgroupMin(item_from));
+		item_to = subgroupBroadcastFirst(subgroupMax(item_to));
+#endif
+
+		for (uint i = item_from; i < item_to; i++) {
+			uint mask = cluster_buffer.data[cluster_spot_offset + i]; // one bit per spot light, 32 lights per word
+			mask &= cluster_get_range_clip_mask(i, item_min, item_max);
+#ifdef USE_SUBGROUPS
+			uint merged_mask = subgroupBroadcastFirst(subgroupOr(mask));
+#else
+			uint merged_mask = mask;
+#endif
 
-	for (uint i = 0; i < spot_light_count; i++) {
-		uint light_index = cluster_data.indices[spot_light_pointer + i];
+			while (merged_mask != 0) {
+				uint bit = findMSB(merged_mask);
+				merged_mask &= ~(1 << bit);
+#ifdef USE_SUBGROUPS
+				if (((1 << bit) & mask) == 0) { //do not process if not originally here
+					continue;
+				}
+#endif
 
-		vec3 light_pos = lights.data[i].position;
-		vec3 light_rel_vec = lights.data[i].position - view_pos;
-		float d = length(light_rel_vec);
-		float shadow_attenuation = 1.0;
+				//if (!bool(spot_lights.data[light_index].mask & draw_call.layer_mask)) {
+				//	continue; //not masked
+				//}
 
-		if (d * lights.data[i].inv_radius < 1.0) {
-			float attenuation = get_omni_attenuation(d, lights.data[i].inv_radius, lights.data[i].attenuation);
+				uint light_index = 32 * i + bit; // index into spot_lights.data (this mask comes from the spot section of the cluster buffer)
 
-			vec3 spot_dir = lights.data[i].direction;
-			float scos = max(dot(-normalize(light_rel_vec), spot_dir), lights.data[i].cone_angle);
-			float spot_rim = max(0.0001, (1.0 - scos) / (1.0 - lights.data[i].cone_angle));
-			attenuation *= 1.0 - pow(spot_rim, lights.data[i].cone_attenuation);
+				vec3 light_pos = spot_lights.data[light_index].position;
+				vec3 light_rel_vec = spot_lights.data[light_index].position - view_pos;
+				float d = length(light_rel_vec);
+				float shadow_attenuation = 1.0;
 
-			vec3 light = lights.data[i].color / M_PI;
+				if (d * spot_lights.data[light_index].inv_radius < 1.0) {
+					float attenuation = get_omni_attenuation(d, spot_lights.data[light_index].inv_radius, spot_lights.data[light_index].attenuation);
 
-			if (lights.data[i].shadow_enabled) {
-				//has shadow
-				vec4 v = vec4(view_pos, 1.0);
+					vec3 spot_dir = spot_lights.data[light_index].direction;
+					float scos = max(dot(-normalize(light_rel_vec), spot_dir), spot_lights.data[light_index].cone_angle);
+					float spot_rim = max(0.0001, (1.0 - scos) / (1.0 - spot_lights.data[light_index].cone_angle));
+					attenuation *= 1.0 - pow(spot_rim, spot_lights.data[light_index].cone_attenuation);
 
-				vec4 splane = (lights.data[i].shadow_matrix * v);
-				splane /= splane.w;
+					vec3 light = spot_lights.data[light_index].color / M_PI;
 
-				float depth = texture(sampler2D(shadow_atlas, linear_sampler), splane.xy).r;
+					if (spot_lights.data[light_index].shadow_enabled) {
+						//has shadow
+						vec4 v = vec4(view_pos, 1.0);
 
-				shadow_attenuation = exp(min(0.0, (depth - splane.z)) / lights.data[i].inv_radius * lights.data[i].shadow_volumetric_fog_fade);
-			}
+						vec4 splane = (spot_lights.data[light_index].shadow_matrix * v);
+						splane /= splane.w;
 
-			total_light += light * attenuation * shadow_attenuation;
+						float depth = texture(sampler2D(shadow_atlas, linear_sampler), splane.xy).r;
+
+						shadow_attenuation = exp(min(0.0, (depth - splane.z)) / spot_lights.data[light_index].inv_radius * spot_lights.data[light_index].shadow_volumetric_fog_fade);
+					}
+
+					total_light += light * attenuation * shadow_attenuation;
+				}
+			}
 		}
 	}
author	reduz <reduzio@gmail.com>	2021-01-17 13:25:38 -0300
committer	Juan Linietsky <reduzio@gmail.com>	2021-01-19 23:31:06 +0100
commit	099dee35f47db3e293cb8e60287ffe6a44f3d5d4 (patch)
tree	dea148899efa156adf4c7b9ff32464871cef4253 /servers/rendering/renderer_rd/shaders
parent	7008e3c6eafa374e5d64ee7867608abe696698c2 (diff)