6 files changed, 438 insertions, 1 deletions
diff --git a/drivers/gles3/shaders/SCsub b/drivers/gles3/shaders/SCsub
index b5797e78b8..dd7ec45242 100644
--- a/drivers/gles3/shaders/SCsub
+++ b/drivers/gles3/shaders/SCsub
@@ -12,5 +12,8 @@ if env['BUILDERS'].has_key('GLES3_GLSL'):
 	env.GLES3_GLSL('screen_space_reflection.glsl');
 	env.GLES3_GLSL('effect_blur.glsl');
 	env.GLES3_GLSL('subsurf_scattering.glsl');
+	env.GLES3_GLSL('ssao.glsl');
+	env.GLES3_GLSL('ssao_minify.glsl');
+	env.GLES3_GLSL('ssao_blur.glsl');
 
 
diff --git a/drivers/gles3/shaders/effect_blur.glsl b/drivers/gles3/shaders/effect_blur.glsl
index fc6de7f654..211b60ca2e 100644
--- a/drivers/gles3/shaders/effect_blur.glsl
+++ b/drivers/gles3/shaders/effect_blur.glsl
@@ -19,12 +19,22 @@ void main() {
 in vec2 uv_interp;
 uniform sampler2D source_color; //texunit:0
 
+#ifdef SSAO_MERGE
+uniform sampler2D source_ssao; //texunit:1
+#endif
+
 uniform float lod;
 uniform vec2 pixel_size;
 
 
 layout(location = 0) out vec4 frag_color;
 
+#ifdef SSAO_MERGE
+
+uniform vec4 ssao_color;
+
+#endif
+
 void main() {
 
 
@@ -52,6 +62,13 @@ void main() {
 	frag_color = color;
 #endif
 
+#ifdef SSAO_MERGE
+
+	vec4 color =textureLod( source_color,  uv_interp,0.0);
+	float ssao =textureLod( source_ssao,  uv_interp,0.0).r;
 
+	frag_color = vec4( mix(color.rgb,color.rgb*mix(ssao_color.rgb,vec3(1.0),ssao),color.a), 1.0 );
+
+#endif
 }
 
diff --git a/drivers/gles3/shaders/scene.glsl b/drivers/gles3/shaders/scene.glsl
index 230544c1c3..61e9e37d2b 100644
--- a/drivers/gles3/shaders/scene.glsl
+++ b/drivers/gles3/shaders/scene.glsl
@@ -76,6 +76,7 @@ layout(std140) uniform SceneData { //ubo:0
 
 	float reflection_multiplier;
 	float subsurface_scatter_width;
+	float ambient_occlusion_affect_light;
 
 };
 
@@ -387,6 +388,7 @@ layout(std140) uniform SceneData {
 
 	float reflection_multiplier;
 	float subsurface_scatter_width;
+	float ambient_occlusion_affect_light;
 
 };
 
@@ -1223,7 +1225,7 @@ LIGHT_SHADER_CODE
 	float max_ambient=max(ambient_light.r,max(ambient_light.g,ambient_light.b));
 	float max_diffuse=max(diffuse_light.r,max(diffuse_light.g,diffuse_light.b));
 	float total_ambient = max_ambient+max_diffuse+max_emission;
-	float ambient_scale = (total_ambient>0.0) ? max_ambient/total_ambient : 0.0;
+	float ambient_scale = (total_ambient>0.0) ? (max_ambient+ambient_occlusion_affect_light*max_diffuse)/total_ambient : 0.0;
 #endif //ENABLE_AO
 
 	diffuse_buffer=vec4(emission+diffuse_light+ambient_light,ambient_scale);
diff --git a/drivers/gles3/shaders/ssao.glsl b/drivers/gles3/shaders/ssao.glsl
new file mode 100644
index 0000000000..75f49ef37a
--- /dev/null
+++ b/drivers/gles3/shaders/ssao.glsl
@@ -0,0 +1,247 @@
+[vertex]
+
+
+layout(location=0) in highp vec4 vertex_attrib;
+
+void main() {
+
+	// Forward the incoming vertex as the clip-space position, with z forced to 1.0.
+	gl_Position = vec4(vertex_attrib.xy, 1.0, vertex_attrib.w);
+}
+
+[fragment]
+
+
+#define NUM_SAMPLES (11)
+
+// If using depth mip levels, the log of the maximum pixel offset before we need to switch to a lower
+// miplevel to maintain reasonable spatial locality in the cache
+// If this number is too small (< 3), too many taps will land in the same pixel, and we'll get bad variance that manifests as flashing.
+// If it is too high (> 5), we'll get bad performance because we're not using the MIP levels effectively
+#define LOG_MAX_OFFSET (3)
+
+// This must be less than or equal to the MAX_MIP_LEVEL defined in SSAO.cpp
+#define MAX_MIP_LEVEL (4)
+
+// This is the number of turns around the circle that the spiral pattern makes.  This should be prime to prevent
+// taps from lining up.  This particular choice was tuned for NUM_SAMPLES == 9 (NUM_SAMPLES is currently 11).
+#define NUM_SPIRAL_TURNS (7)
+
+
+uniform sampler2D source_depth; //texunit:0
+uniform usampler2D source_depth_mipmaps; //texunit:1
+uniform sampler2D source_normal; //texunit:2
+
+uniform ivec2 screen_size;
+uniform float camera_z_far;
+uniform float camera_z_near;
+
+uniform float intensity_div_r6;
+uniform float radius;
+
+#ifdef ENABLE_RADIUS2
+uniform float intensity_div_r62;
+uniform float radius2;
+#endif
+
+uniform float bias;
+uniform float proj_scale;
+
+layout(location = 0) out float visibility;
+
+uniform vec4 proj_info;
+/** Builds a position from pixel coordinate S and depth z: xy = (S * proj_info.xy + proj_info.zw) * z, with z passed through. */
+vec3 reconstructCSPosition(vec2 S, float z) {
+    return vec3((S * proj_info.xy + proj_info.zw) * z, z);
+}
+
+/** Fetches the depth stored at pixel ssP, linearizes it with the near/far planes and returns the reconstructed position. */
+vec3 getPosition(ivec2 ssP) {
+    // Remap the fetched depth value with * 2 - 1
+    float ndc_z = texelFetch(source_depth, ssP, 0).r * 2.0 - 1.0;
+
+    // Near/far depth linearization (presumably assumes a perspective projection), then negate
+    float linear_z = 2.0 * camera_z_near * camera_z_far / (camera_z_far + camera_z_near - ndc_z * (camera_z_far - camera_z_near));
+    float view_z = -linear_z;
+
+    // Offset to pixel center
+    return reconstructCSPosition(vec2(ssP) + vec2(0.5), view_z);
+}
+
+/** Returns the unit face normal at C, built from the cross product of its screen-space derivatives (dFdy x dFdx). */
+vec3 reconstructCSFaceNormal(vec3 C) {
+    vec3 unnormalized = cross(dFdy(C), dFdx(C));
+    return normalize(unnormalized);
+}
+
+
+/** Returns a unit vector and a screen-space radius for the tap on a unit disk (the caller should scale by the actual disk radius) */
+vec2 tapLocation(int sampleNumber, float spinAngle, out float ssR){
+    // Radius relative to ssR. Integer operands are converted explicitly: GLSL ES has no implicit int -> float conversion.
+    float alpha = (float(sampleNumber) + 0.5) * (1.0 / float(NUM_SAMPLES));
+    // 6.28 approximates 2*pi, so the taps wind NUM_SPIRAL_TURNS times around the disk as alpha goes from 0 to 1
+    float angle = alpha * (float(NUM_SPIRAL_TURNS) * 6.28) + spinAngle;
+    ssR = alpha;
+    return vec2(cos(angle), sin(angle));
+}
+
+
+/** Read the camera-space position of the point at screen-space pixel ssC + unitOffset * ssR.  Assumes length(unitOffset) == 1 */
+vec3 getOffsetPosition(ivec2 ssC, vec2 unitOffset, float ssR) {
+	// Pick the mip level from the tap distance:
+	//  level = floor(log2(ssR)) - LOG_MAX_OFFSET, limited to [0, MAX_MIP_LEVEL]
+	int level = clamp(int(floor(log2(ssR))) - LOG_MAX_OFFSET, 0, MAX_MIP_LEVEL);
+
+	// Full-resolution pixel hit by this tap
+	ivec2 tap_px = ssC + ivec2(ssR * unitOffset);
+
+	// Same pixel expressed at the chosen mip level (divide by 2^level).
+	// texelFetch bypasses the texture unit, so stay inside the level's bounds by hand.
+	ivec2 level_max = (screen_size >> level) - ivec2(1);
+	ivec2 level_px = clamp(tap_px >> level, ivec2(0), level_max);
+
+	float view_z;
+
+	if (level >= 1) {
+		// Levels 1 and up are read from the integer mip chain at index level - 1.
+		// The fetched value is decoded as d / 65535 * camera_z_far and negated.
+		uint packed_depth = texelFetch(source_depth_mipmaps, level_px, level - 1).r;
+		view_z = -(float(packed_depth) / 65535.0) * camera_z_far;
+
+	} else {
+		// Level 0 is the regular depth buffer: linearize it with the near/far planes
+		float ndc_z = texelFetch(source_depth, level_px, 0).r * 2.0 - 1.0;
+		float linear_z = 2.0 * camera_z_near * camera_z_far / (camera_z_far + camera_z_near - ndc_z * (camera_z_far - camera_z_near));
+		view_z = -linear_z;
+	}
+
+
+	// Offset to pixel center
+	return reconstructCSPosition(vec2(tap_px) + vec2(0.5), view_z);
+
+}
+
+
+
+/** Compute the occlusion contributed by tap \a tapIndex around the pixel at \a ssC, which corresponds
+    to camera-space point \a C with unit normal \a n_C. \a ssDiskRadius is the maximum screen-space
+    sampling radius and \a p_radius the radius whose square drives the falloff.
+
+    Note that units of H() in the HPG12 paper are meters, not
+    unitless.  The whole falloff/sampling function is therefore
+    unitless.  In this implementation, we factor out (9 / radius).
+
+    Falloff B is the active one; A, C and D are kept below as commented-out alternatives. */
+float sampleAO(in ivec2 ssC, in vec3 C, in vec3 n_C, in float ssDiskRadius,in float p_radius, in int tapIndex, in float randomPatternRotationAngle) {
+    // Tap direction on the unit disk (spun for this pixel) and its distance in pixels
+    float tap_dist;
+    vec2 tap_dir = tapLocation(tapIndex, randomPatternRotationAngle, tap_dist);
+    tap_dist *= ssDiskRadius;
+
+    // Vector from the shaded point to the occluding point, both in camera space
+    vec3 occluder = getOffsetPosition(ssC, tap_dir, tap_dist);
+    vec3 to_occluder = occluder - C;
+
+    float dist_sq = dot(to_occluder, to_occluder);
+    float along_normal = dot(to_occluder, n_C);
+
+    const float epsilon = 0.01;
+    float radius_sq = p_radius * p_radius;
+
+    // A: From the HPG12 paper
+    // Note large epsilon to avoid overdarkening within cracks
+    //return float(dist_sq < radius_sq) * max((along_normal - bias) / (epsilon + dist_sq), 0.0) * radius_sq * 0.6;
+
+    // B: Smoother transition to zero (lowers contrast, smoothing out corners). [Recommended]
+    float falloff = max(radius_sq - dist_sq, 0.0);
+    float cubed = falloff * falloff * falloff;
+    return cubed * max((along_normal - bias) / (epsilon + dist_sq), 0.0);
+
+    // C: Medium contrast (which looks better at high radii), no division.  Note that the
+    // contribution still falls off with radius^2, but we've adjusted the rate in a way that is
+    // more computationally efficient and happens to be aesthetically pleasing.
+    // return 4.0 * max(1.0 - dist_sq * invRadius2, 0.0) * max(along_normal - bias, 0.0);
+
+    // D: Low contrast, no division operation
+    // return 2.0 * float(dist_sq < radius_sq) * max(along_normal - bias, 0.0);
+}
+
+
+
+void main() {
+	// Estimates ambient visibility for the current pixel from NUM_SAMPLES depth taps (a second set when ENABLE_RADIUS2 is defined).
+	// Integers are converted with float() explicitly because GLSL ES has no implicit int -> float conversion.
+	// Pixel being shaded
+	ivec2 ssC = ivec2(gl_FragCoord.xy);
+
+	// Camera-space point being shaded
+	vec3 C = getPosition(ssC);
+
+/*	if (C.z <= -camera_z_far*0.999) {
+	       // We're on the skybox
+	       visibility=1.0;
+	       return;
+	}*/
+
+	//visibility=-C.z/camera_z_far;
+	//return;
+
+	//vec3 n_C = texelFetch(source_normal,ssC,0).rgb * 2.0 - 1.0;
+
+	vec3 n_C = reconstructCSFaceNormal(C);
+	n_C = -n_C;
+
+
+	// Hash function used in the HPG12 AlchemyAO paper
+	float randomPatternRotationAngle = float((3 * ssC.x ^ ssC.y + ssC.x * ssC.y) * 10);
+
+	// Reconstruct normals from positions. These will lead to 1-pixel black lines
+	// at depth discontinuities, however the blur will wipe those out so they are not visible
+	// in the final image.
+
+	// Choose the screen-space sample radius
+	// proportional to the projected area of the sphere
+	float ssDiskRadius = -proj_scale * radius / C.z;
+
+	float sum = 0.0;
+	for (int i = 0; i < NUM_SAMPLES; ++i) {
+		sum += sampleAO(ssC, C, n_C, ssDiskRadius, radius,i, randomPatternRotationAngle);
+	}
+
+	float A = max(0.0, 1.0 - sum * intensity_div_r6 * (5.0 / float(NUM_SAMPLES)));
+
+#ifdef ENABLE_RADIUS2
+
+	//go again for radius2
+	randomPatternRotationAngle = float((5 * ssC.x ^ ssC.y + ssC.x * ssC.y) * 11);
+
+	// The second pass reuses C and n_C; only the rotation hash, the sampling radius
+	// and the intensity factor differ. The lower of the two visibilities is kept
+	// by the min() below.
+
+	// Choose the screen-space sample radius
+	// proportional to the projected area of the sphere
+	ssDiskRadius = -proj_scale * radius2 / C.z;
+
+	sum = 0.0;
+	for (int i = 0; i < NUM_SAMPLES; ++i) {
+		sum += sampleAO(ssC, C, n_C, ssDiskRadius,radius2, i, randomPatternRotationAngle);
+	}
+
+	A= min(A,max(0.0, 1.0 - sum * intensity_div_r62 * (5.0 / float(NUM_SAMPLES))));
+#endif
+	// Bilateral box-filter over a quad for free, respecting depth edges
+	// (the difference that this makes is subtle)
+	if (abs(dFdx(C.z)) < 0.02) {
+		A -= dFdx(A) * (float(ssC.x & 1) - 0.5);
+	}
+	if (abs(dFdy(C.z)) < 0.02) {
+		A -= dFdy(A) * (float(ssC.y & 1) - 0.5);
+	}
+
+	visibility = A;
+
+}
+
+
+
diff --git a/drivers/gles3/shaders/ssao_blur.glsl b/drivers/gles3/shaders/ssao_blur.glsl
new file mode 100644
index 0000000000..31f3841a2a
--- /dev/null
+++ b/drivers/gles3/shaders/ssao_blur.glsl
@@ -0,0 +1,113 @@
+[vertex]
+
+
+layout(location=0) in highp vec4 vertex_attrib;
+
+
+void main() {
+
+	// Emit the input vertex as the clip-space position, overriding z with 1.0.
+	gl_Position = vec4(vertex_attrib.xy, 1.0, vertex_attrib.w);
+}
+
+[fragment]
+
+
+uniform sampler2D source_ssao; //texunit:0
+uniform sampler2D source_depth; //texunit:1
+
+
+layout(location = 0) out float visibility;
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+// Tunable Parameters:
+
+/** Increase to make depth edges crisper. Decrease to reduce flicker. */
+#define EDGE_SHARPNESS     (1.0)
+
+/** Step in 2-pixel intervals since we already blurred against neighbors in the
+    first AO pass.  This constant can be increased while R decreases to improve
+    performance at the expense of some dithering artifacts.
+
+    Morgan found that a scale of 3 left a 1-pixel checkerboard grid that was
+    unobjectionable after shading was applied but eliminated most temporal incoherence
+    from using small numbers of sample taps.
+    */
+#define SCALE               (3)
+
+/** Filter radius in pixels. This will be multiplied by SCALE. */
+#define R                   (4)
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Gaussian coefficients
+const float gaussian[R + 1] =
+//    float[](0.356642, 0.239400, 0.072410, 0.009869);
+//    float[](0.398943, 0.241971, 0.053991, 0.004432, 0.000134);  // stddev = 1.0
+    float[](0.153170, 0.144893, 0.122649, 0.092902, 0.062970);  // stddev = 2.0
+//      float[](0.111220, 0.107798, 0.098151, 0.083953, 0.067458, 0.050920, 0.036108); // stddev = 3.0
+
+/** (1, 0) or (0, 1)*/
+uniform ivec2       axis;
+
+uniform float camera_z_far;
+uniform float camera_z_near;
+
+void main() {
+	// Depth-aware blur of source_ssao along `axis`: 2*R taps spaced SCALE pixels apart, weighted by gaussian[] and depth similarity.
+	ivec2 ssC = ivec2(gl_FragCoord.xy);
+
+	float depth = texelFetch(source_depth, ssC, 0).r;
+
+	depth = depth * 2.0 - 1.0;
+	depth = 2.0 * camera_z_near * camera_z_far / (camera_z_far + camera_z_near - depth * (camera_z_far - camera_z_near));
+
+	float depth_divide = 1.0 / camera_z_far;
+
+	depth*=depth_divide;
+
+	//if (depth > camera_z_far*0.999) {
+	//	discard;//skybox
+	//}
+
+	float sum = texelFetch(source_ssao, ssC, 0).r;
+
+	// Base weight for depth falloff.  Increase this for more blurriness,
+	// decrease it for better edge discrimination
+	float BASE = gaussian[0];
+	float totalWeight = BASE;
+	sum *= totalWeight;
+
+
+	for (int r = -R; r <= R; ++r) {
+		// We already handled the zero case above.  This loop should be unrolled and the static branch optimized out,
+		// so the IF statement has no runtime cost
+		if (r != 0) {
+			// Depth must be read at the tap (ppos), not at the center pixel, or the bilateral term below is always 1.
+			ivec2 ppos = ssC + axis * (r * SCALE);
+			float value = texelFetch(source_ssao, ppos, 0).r;
+			float temp_depth = texelFetch(source_depth, ppos, 0).r;
+			// NOTE(review): ppos is not clamped to the texture size; texelFetch outside the texture is undefined.
+			temp_depth = temp_depth * 2.0 - 1.0;
+			temp_depth = 2.0 * camera_z_near * camera_z_far / (camera_z_far + camera_z_near - temp_depth * (camera_z_far - camera_z_near));
+			temp_depth *= depth_divide;
+
+			// spatial domain: offset gaussian tap
+			float weight = 0.3 + gaussian[abs(r)];
+
+			// range domain (the "bilateral" weight). As depth difference increases, decrease weight.
+			weight *= max(0.0, 1.0
+				      - (EDGE_SHARPNESS * 2000.0) * abs(temp_depth - depth)
+				      );
+
+			sum += value * weight;
+			totalWeight += weight;
+		}
+	}
+
+	const float epsilon = 0.0001;
+	visibility = sum / (totalWeight + epsilon);
+}
diff --git a/drivers/gles3/shaders/ssao_minify.glsl b/drivers/gles3/shaders/ssao_minify.glsl
new file mode 100644
index 0000000000..df9045c28a
--- /dev/null
+++ b/drivers/gles3/shaders/ssao_minify.glsl
@@ -0,0 +1,55 @@
+[vertex]
+
+
+layout(location=0) in highp vec4 vertex_attrib;
+
+void main() {
+	// Pass the incoming vertex through unchanged as the clip-space position.
+	gl_Position = vertex_attrib;
+}
+
+[fragment]
+
+
+#ifdef MINIFY_START
+
+#define SDEPTH_TYPE highp sampler2D
+uniform float camera_z_far;
+uniform float camera_z_near;
+
+#else
+
+#define SDEPTH_TYPE mediump usampler2D
+
+#endif
+
+uniform SDEPTH_TYPE source_depth; //texunit:0
+
+uniform ivec2 from_size;
+uniform int source_mipmap;
+
+layout(location = 0) out mediump uint depth;
+
+void main() {
+	// Writes one depth texel taken from a single source sample at ssP * 2 plus a 0/1 offset (clamped to from_size - 1).
+	// With MINIFY_START the source is a float depth buffer that is linearized and scaled to the 0..65535 range first.
+	ivec2 ssP = ivec2(gl_FragCoord.xy);
+
+	  // Rotated grid subsampling to avoid XY directional bias or Z precision bias while downsampling.
+	  // On DX9, the bit-and can be implemented with floating-point modulo
+
+#ifdef MINIFY_START
+	float fdepth = texelFetch(source_depth, clamp(ssP * 2 + ivec2(ssP.y & 1, ssP.x & 1), ivec2(0), from_size - ivec2(1)), source_mipmap).r;
+	fdepth = fdepth * 2.0 - 1.0;
+	fdepth = 2.0 * camera_z_near * camera_z_far / (camera_z_far + camera_z_near - fdepth * (camera_z_far - camera_z_near));
+	fdepth /= camera_z_far;
+	depth = uint(clamp(fdepth * 65535.0, 0.0, 65535.0));
+	// 65535.0 is written as a float literal: GLSL ES does not convert int operands to float implicitly.
+#else
+	depth = texelFetch(source_depth, clamp(ssP * 2 + ivec2(ssP.y & 1, ssP.x & 1), ivec2(0), from_size - ivec2(1)), source_mipmap).r;
+#endif
+
+
+}
+
+