348 files changed, 14177 insertions, 8816 deletions
diff --git a/.editorconfig b/.editorconfig
index b7ef43c340..ead5e14ca9 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -8,6 +8,10 @@ indent_style = tab
 [*.{cpp,hpp,c,h,mm}]
 trim_trailing_whitespace = true
 
+[*.py]
+indent_style = space
+indent_size = 4
+
 [.travis.yml]
 indent_style = space
 indent_size = 2
diff --git a/.travis.yml b/.travis.yml
index c11a21aeef..0dfeaf16e1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -48,14 +48,14 @@ addons:
       - pkg-config
       - libx11-dev
       - libxcursor-dev
-      - libasound2-dev
-      - libfreetype6-dev
+      - libxi-dev
+      - libxinerama-dev
+      - libxrandr-dev
       - libgl1-mesa-dev
       - libglu1-mesa-dev
+      - libasound2-dev
+      - libfreetype6-dev
       - libssl-dev
-      - libxinerama-dev
-      - libxrandr-dev
-      - libxi-dev
 
       # For cross-compiling to Windows.
       #- binutils-mingw-w64-i686
@@ -90,5 +90,5 @@ script:
   - if [ "$STATIC_CHECKS" = "yes" ]; then
       sh ./misc/travis/clang-format.sh;
     else
-      scons -j2 CC=$CC CXX=$CXX platform=$GODOT_TARGET TOOLS=$TOOLS verbose=yes progress=no;
+      scons -j2 CC=$CC CXX=$CXX platform=$GODOT_TARGET TOOLS=$TOOLS verbose=yes progress=no openmp=no;
     fi
diff --git a/SConstruct b/SConstruct
index b3e0672c94..af1ffb544e 100644
--- a/SConstruct
+++ b/SConstruct
@@ -168,6 +168,8 @@ opts.Add(BoolVariable('vsproj', "Generate Visual Studio Project.", False))
 opts.Add(EnumVariable('warnings', "Set the level of warnings emitted during compilation", 'no', ('extra', 'all', 'moderate', 'no')))
 opts.Add(BoolVariable('progress', "Show a progress indicator during build", True))
 opts.Add(BoolVariable('dev', "If yes, alias for verbose=yes warnings=all", False))
+opts.Add(BoolVariable('openmp', "If yes, enable OpenMP", True))
+opts.Add(EnumVariable('macports_clang', "Build using clang from MacPorts", 'no', ('no', '5.0', 'devel')))
 
 # Thirdparty libraries
 opts.Add(BoolVariable('builtin_enet', "Use the builtin enet library", True))
diff --git a/core/io/http_client.cpp b/core/io/http_client.cpp
index 5097898314..e457a4ac1e 100644
--- a/core/io/http_client.cpp
+++ b/core/io/http_client.cpp
@@ -30,27 +30,53 @@
 #include "http_client.h"
 #include "io/stream_peer_ssl.h"
 
+const char *HTTPClient::_methods[METHOD_MAX] = {
+	"GET",
+	"HEAD",
+	"POST",
+	"PUT",
+	"DELETE",
+	"OPTIONS",
+	"TRACE",
+	"CONNECT",
+	"PATCH"
+};
+
 #ifndef JAVASCRIPT_ENABLED
 Error HTTPClient::connect_to_host(const String &p_host, int p_port, bool p_ssl, bool p_verify_host) {
 
 	close();
+
 	conn_port = p_port;
 	conn_host = p_host;
 
-	if (conn_host.begins_with("http://")) {
+	ssl = p_ssl;
+	ssl_verify_host = p_verify_host;
+
+	String host_lower = conn_host.to_lower();
+	if (host_lower.begins_with("http://")) {
 
-		conn_host = conn_host.replace_first("http://", "");
-	} else if (conn_host.begins_with("https://")) {
-		//use https
-		conn_host = conn_host.replace_first("https://", "");
+		conn_host = conn_host.substr(7, conn_host.length() - 7);
+	} else if (host_lower.begins_with("https://")) {
+
+		ssl = true;
+		conn_host = conn_host.substr(8, conn_host.length() - 8);
+	}
+
+	ERR_FAIL_COND_V(conn_host.length() < HOST_MIN_LEN, ERR_INVALID_PARAMETER);
+
+	if (conn_port < 0) {
+		if (ssl) {
+			conn_port = PORT_HTTPS;
+		} else {
+			conn_port = PORT_HTTP;
+		}
 	}
 
-	ssl = p_ssl;
-	ssl_verify_host = p_verify_host;
 	connection = tcp_connection;
 
 	if (conn_host.is_valid_ip_address()) {
-		//is ip
+		// Host contains valid IP
 		Error err = tcp_connection->connect_to_host(IP_Address(conn_host), p_port);
 		if (err) {
 			status = STATUS_CANT_CONNECT;
@@ -59,7 +85,7 @@ Error HTTPClient::connect_to_host(const String &p_host, int p_port, bool p_ssl,
 
 		status = STATUS_CONNECTING;
 	} else {
-		//is hostname
+		// Host contains hostname and needs to be resolved to IP
 		resolving = IP::get_singleton()->resolve_hostname_queue_item(conn_host);
 		status = STATUS_RESOLVING;
 	}
@@ -82,23 +108,13 @@ Ref<StreamPeer> HTTPClient::get_connection() const {
 Error HTTPClient::request_raw(Method p_method, const String &p_url, const Vector<String> &p_headers, const PoolVector<uint8_t> &p_body) {
 
 	ERR_FAIL_INDEX_V(p_method, METHOD_MAX, ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V(!p_url.begins_with("/"), ERR_INVALID_PARAMETER);
 	ERR_FAIL_COND_V(status != STATUS_CONNECTED, ERR_INVALID_PARAMETER);
 	ERR_FAIL_COND_V(connection.is_null(), ERR_INVALID_DATA);
 
-	static const char *_methods[METHOD_MAX] = {
-		"GET",
-		"HEAD",
-		"POST",
-		"PUT",
-		"DELETE",
-		"OPTIONS",
-		"TRACE",
-		"CONNECT"
-	};
-
 	String request = String(_methods[p_method]) + " " + p_url + " HTTP/1.1\r\n";
-	if ((ssl && conn_port == 443) || (!ssl && conn_port == 80)) {
-		// don't append the standard ports
+	if ((ssl && conn_port == PORT_HTTPS) || (!ssl && conn_port == PORT_HTTP)) {
+		// Don't append the standard ports
 		request += "Host: " + conn_host + "\r\n";
 	} else {
 		request += "Host: " + conn_host + ":" + itos(conn_port) + "\r\n";
@@ -112,17 +128,20 @@ Error HTTPClient::request_raw(Method p_method, const String &p_url, const Vector
 	}
 	if (add_clen) {
 		request += "Content-Length: " + itos(p_body.size()) + "\r\n";
-		//should it add utf8 encoding? not sure
+		// Should it add utf8 encoding?
 	}
 	request += "\r\n";
 	CharString cs = request.utf8();
 
 	PoolVector<uint8_t> data;
-
-	//Maybe this goes faster somehow?
-	for (int i = 0; i < cs.length(); i++) {
-		data.append(cs[i]);
+	data.resize(cs.length());
+	{
+		PoolVector<uint8_t>::Write data_write = data.write();
+		for (int i = 0; i < cs.length(); i++) {
+			data_write[i] = cs[i];
+		}
 	}
+
 	data.append_array(p_body);
 
 	PoolVector<uint8_t>::Read r = data.read();
@@ -142,23 +161,13 @@ Error HTTPClient::request_raw(Method p_method, const String &p_url, const Vector
 Error HTTPClient::request(Method p_method, const String &p_url, const Vector<String> &p_headers, const String &p_body) {
 
 	ERR_FAIL_INDEX_V(p_method, METHOD_MAX, ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V(!p_url.begins_with("/"), ERR_INVALID_PARAMETER);
 	ERR_FAIL_COND_V(status != STATUS_CONNECTED, ERR_INVALID_PARAMETER);
 	ERR_FAIL_COND_V(connection.is_null(), ERR_INVALID_DATA);
 
-	static const char *_methods[METHOD_MAX] = {
-		"GET",
-		"HEAD",
-		"POST",
-		"PUT",
-		"DELETE",
-		"OPTIONS",
-		"TRACE",
-		"CONNECT"
-	};
-
 	String request = String(_methods[p_method]) + " " + p_url + " HTTP/1.1\r\n";
-	if ((ssl && conn_port == 443) || (!ssl && conn_port == 80)) {
-		// don't append the standard ports
+	if ((ssl && conn_port == PORT_HTTPS) || (!ssl && conn_port == PORT_HTTP)) {
+		// Don't append the standard ports
 		request += "Host: " + conn_host + "\r\n";
 	} else {
 		request += "Host: " + conn_host + ":" + itos(conn_port) + "\r\n";
@@ -172,7 +181,7 @@ Error HTTPClient::request(Method p_method, const String &p_url, const Vector<Str
 	}
 	if (add_clen) {
 		request += "Content-Length: " + itos(p_body.utf8().length()) + "\r\n";
-		//should it add utf8 encoding? not sure
+		// Should it add utf8 encoding?
 	}
 	request += "\r\n";
 	request += p_body;
@@ -251,7 +260,7 @@ Error HTTPClient::poll() {
 			IP::ResolverStatus rstatus = IP::get_singleton()->get_resolve_item_status(resolving);
 			switch (rstatus) {
 				case IP::RESOLVER_STATUS_WAITING:
-					return OK; //still resolving
+					return OK; // Still resolving
 
 				case IP::RESOLVER_STATUS_DONE: {
 
@@ -283,7 +292,7 @@ Error HTTPClient::poll() {
 			switch (s) {
 
 				case StreamPeerTCP::STATUS_CONNECTING: {
-					return OK; //do none
+					return OK;
 				} break;
 				case StreamPeerTCP::STATUS_CONNECTED: {
 					if (ssl) {
@@ -294,7 +303,6 @@ Error HTTPClient::poll() {
 							status = STATUS_SSL_HANDSHAKE_ERROR;
 							return ERR_CANT_CONNECT;
 						}
-						//print_line("SSL! TURNED ON!");
 						connection = ssl;
 					}
 					status = STATUS_CONNECTED;
@@ -310,7 +318,7 @@ Error HTTPClient::poll() {
 			}
 		} break;
 		case STATUS_CONNECTED: {
-			//request something please
+			// Connection established, requests can now be made
 			return OK;
 		} break;
 		case STATUS_REQUESTING: {
@@ -326,7 +334,7 @@ Error HTTPClient::poll() {
 				}
 
 				if (rec == 0)
-					return OK; //keep trying!
+					return OK; // Still requesting, keep trying!
 
 				response_str.push_back(byte);
 				int rs = response_str.size();
@@ -334,11 +342,10 @@ Error HTTPClient::poll() {
 						(rs >= 2 && response_str[rs - 2] == '\n' && response_str[rs - 1] == '\n') ||
 						(rs >= 4 && response_str[rs - 4] == '\r' && response_str[rs - 3] == '\n' && response_str[rs - 2] == '\r' && response_str[rs - 1] == '\n')) {
 
-					//end of response, parse.
+					// End of response, parse.
 					response_str.push_back(0);
 					String response;
 					response.parse_utf8((const char *)response_str.ptr());
-					//print_line("END OF RESPONSE? :\n"+response+"\n------");
 					Vector<String> responses = response.split("\n");
 					body_size = 0;
 					chunked = false;
@@ -361,7 +368,6 @@ Error HTTPClient::poll() {
 
 						if (s.begins_with("transfer-encoding:")) {
 							String encoding = header.substr(header.find(":") + 1, header.length()).strip_edges();
-							//print_line("TRANSFER ENCODING: "+encoding);
 							if (encoding == "chunked") {
 								chunked = true;
 							}
@@ -379,14 +385,14 @@ Error HTTPClient::poll() {
 
 					if (body_size == 0 && !chunked) {
 
-						status = STATUS_CONNECTED; //ask for something again?
+						status = STATUS_CONNECTED; // Ready for new requests
 					} else {
 						status = STATUS_BODY;
 					}
 					return OK;
 				}
 			}
-			//wait for response
+			// Wait for response
 			return OK;
 		} break;
 		case STATUS_DISCONNECTED: {
@@ -422,7 +428,7 @@ PoolByteArray HTTPClient::read_response_body_chunk() {
 		while (true) {
 
 			if (chunk_left == 0) {
-				//reading len
+				// Reading length
 				uint8_t b;
 				int rec = 0;
 				err = _get_http_data(&b, 1, rec);
@@ -465,7 +471,7 @@ PoolByteArray HTTPClient::read_response_body_chunk() {
 					}
 
 					if (len == 0) {
-						//end!
+						// End reached!
 						status = STATUS_CONNECTED;
 						chunk.clear();
 						return PoolByteArray();
@@ -523,7 +529,7 @@ PoolByteArray HTTPClient::read_response_body_chunk() {
 				to_read -= rec;
 				_offset += rec;
 			} else {
-				if (to_read > 0) //ended up reading less
+				if (to_read > 0) // Ended up reading less
 					ret.resize(_offset);
 				break;
 			}
@@ -538,7 +544,7 @@ PoolByteArray HTTPClient::read_response_body_chunk() {
 		close();
 		if (err == ERR_FILE_EOF) {
 
-			status = STATUS_DISCONNECTED; //server disconnected
+			status = STATUS_DISCONNECTED; // Server disconnected
 		} else {
 
 			status = STATUS_CONNECTION_ERROR;
@@ -591,7 +597,7 @@ HTTPClient::HTTPClient() {
 	tcp_connection = StreamPeerTCP::create_ref();
 	resolving = IP::RESOLVER_INVALID_ID;
 	status = STATUS_DISCONNECTED;
-	conn_port = 80;
+	conn_port = -1;
 	body_size = 0;
 	chunked = false;
 	body_left = 0;
@@ -651,7 +657,7 @@ PoolStringArray HTTPClient::_get_response_headers() {
 
 void HTTPClient::_bind_methods() {
 
-	ClassDB::bind_method(D_METHOD("connect_to_host", "host", "port", "use_ssl", "verify_host"), &HTTPClient::connect_to_host, DEFVAL(false), DEFVAL(true));
+	ClassDB::bind_method(D_METHOD("connect_to_host", "host", "port", "use_ssl", "verify_host"), &HTTPClient::connect_to_host, DEFVAL(-1), DEFVAL(false), DEFVAL(true));
 	ClassDB::bind_method(D_METHOD("set_connection", "connection"), &HTTPClient::set_connection);
 	ClassDB::bind_method(D_METHOD("get_connection"), &HTTPClient::get_connection);
 	ClassDB::bind_method(D_METHOD("request_raw", "method", "url", "headers", "body"), &HTTPClient::request_raw);
@@ -683,16 +689,17 @@ void HTTPClient::_bind_methods() {
 	BIND_ENUM_CONSTANT(METHOD_OPTIONS);
 	BIND_ENUM_CONSTANT(METHOD_TRACE);
 	BIND_ENUM_CONSTANT(METHOD_CONNECT);
+	BIND_ENUM_CONSTANT(METHOD_PATCH);
 	BIND_ENUM_CONSTANT(METHOD_MAX);
 
 	BIND_ENUM_CONSTANT(STATUS_DISCONNECTED);
-	BIND_ENUM_CONSTANT(STATUS_RESOLVING); //resolving hostname (if passed a hostname)
+	BIND_ENUM_CONSTANT(STATUS_RESOLVING); // Resolving hostname (if hostname was passed in)
 	BIND_ENUM_CONSTANT(STATUS_CANT_RESOLVE);
-	BIND_ENUM_CONSTANT(STATUS_CONNECTING); //connecting to ip
+	BIND_ENUM_CONSTANT(STATUS_CONNECTING); // Connecting to IP
 	BIND_ENUM_CONSTANT(STATUS_CANT_CONNECT);
-	BIND_ENUM_CONSTANT(STATUS_CONNECTED); //connected );  requests only accepted here
-	BIND_ENUM_CONSTANT(STATUS_REQUESTING); // request in progress
-	BIND_ENUM_CONSTANT(STATUS_BODY); // request resulted in body );  which must be read
+	BIND_ENUM_CONSTANT(STATUS_CONNECTED); // Connected, now accepting requests
+	BIND_ENUM_CONSTANT(STATUS_REQUESTING); // Request in progress
+	BIND_ENUM_CONSTANT(STATUS_BODY); // Request resulted in body which must be read
 	BIND_ENUM_CONSTANT(STATUS_CONNECTION_ERROR);
 	BIND_ENUM_CONSTANT(STATUS_SSL_HANDSHAKE_ERROR);
 
@@ -709,6 +716,7 @@ void HTTPClient::_bind_methods() {
 	BIND_ENUM_CONSTANT(RESPONSE_RESET_CONTENT);
 	BIND_ENUM_CONSTANT(RESPONSE_PARTIAL_CONTENT);
 	BIND_ENUM_CONSTANT(RESPONSE_MULTI_STATUS);
+	BIND_ENUM_CONSTANT(RESPONSE_ALREADY_REPORTED);
 	BIND_ENUM_CONSTANT(RESPONSE_IM_USED);
 
 	// 3xx redirection
@@ -718,7 +726,9 @@ void HTTPClient::_bind_methods() {
 	BIND_ENUM_CONSTANT(RESPONSE_SEE_OTHER);
 	BIND_ENUM_CONSTANT(RESPONSE_NOT_MODIFIED);
 	BIND_ENUM_CONSTANT(RESPONSE_USE_PROXY);
+	BIND_ENUM_CONSTANT(RESPONSE_SWITCH_PROXY);
 	BIND_ENUM_CONSTANT(RESPONSE_TEMPORARY_REDIRECT);
+	BIND_ENUM_CONSTANT(RESPONSE_PERMANENT_REDIRECT);
 
 	// 4xx client error
 	BIND_ENUM_CONSTANT(RESPONSE_BAD_REQUEST);
@@ -739,10 +749,16 @@ void HTTPClient::_bind_methods() {
 	BIND_ENUM_CONSTANT(RESPONSE_UNSUPPORTED_MEDIA_TYPE);
 	BIND_ENUM_CONSTANT(RESPONSE_REQUESTED_RANGE_NOT_SATISFIABLE);
 	BIND_ENUM_CONSTANT(RESPONSE_EXPECTATION_FAILED);
+	BIND_ENUM_CONSTANT(RESPONSE_IM_A_TEAPOT);
+	BIND_ENUM_CONSTANT(RESPONSE_MISDIRECTED_REQUEST);
 	BIND_ENUM_CONSTANT(RESPONSE_UNPROCESSABLE_ENTITY);
 	BIND_ENUM_CONSTANT(RESPONSE_LOCKED);
 	BIND_ENUM_CONSTANT(RESPONSE_FAILED_DEPENDENCY);
 	BIND_ENUM_CONSTANT(RESPONSE_UPGRADE_REQUIRED);
+	BIND_ENUM_CONSTANT(RESPONSE_PRECONDITION_REQUIRED);
+	BIND_ENUM_CONSTANT(RESPONSE_TOO_MANY_REQUESTS);
+	BIND_ENUM_CONSTANT(RESPONSE_REQUEST_HEADER_FIELDS_TOO_LARGE);
+	BIND_ENUM_CONSTANT(RESPONSE_UNAVAILABLE_FOR_LEGAL_REASONS);
 
 	// 5xx server error
 	BIND_ENUM_CONSTANT(RESPONSE_INTERNAL_SERVER_ERROR);
@@ -751,6 +767,9 @@ void HTTPClient::_bind_methods() {
 	BIND_ENUM_CONSTANT(RESPONSE_SERVICE_UNAVAILABLE);
 	BIND_ENUM_CONSTANT(RESPONSE_GATEWAY_TIMEOUT);
 	BIND_ENUM_CONSTANT(RESPONSE_HTTP_VERSION_NOT_SUPPORTED);
+	BIND_ENUM_CONSTANT(RESPONSE_VARIANT_ALSO_NEGOTIATES);
 	BIND_ENUM_CONSTANT(RESPONSE_INSUFFICIENT_STORAGE);
+	BIND_ENUM_CONSTANT(RESPONSE_LOOP_DETECTED);
 	BIND_ENUM_CONSTANT(RESPONSE_NOT_EXTENDED);
+	BIND_ENUM_CONSTANT(RESPONSE_NETWORK_AUTH_REQUIRED);
 }
diff --git a/core/io/http_client.h b/core/io/http_client.h
index db5dd115bd..3d8953c156 100644
--- a/core/io/http_client.h
+++ b/core/io/http_client.h
@@ -56,6 +56,7 @@ public:
 		RESPONSE_RESET_CONTENT = 205,
 		RESPONSE_PARTIAL_CONTENT = 206,
 		RESPONSE_MULTI_STATUS = 207,
+		RESPONSE_ALREADY_REPORTED = 208,
 		RESPONSE_IM_USED = 226,
 
 		// 3xx redirection
@@ -65,7 +66,9 @@ public:
 		RESPONSE_SEE_OTHER = 303,
 		RESPONSE_NOT_MODIFIED = 304,
 		RESPONSE_USE_PROXY = 305,
+		RESPONSE_SWITCH_PROXY = 306,
 		RESPONSE_TEMPORARY_REDIRECT = 307,
+		RESPONSE_PERMANENT_REDIRECT = 308,
 
 		// 4xx client error
 		RESPONSE_BAD_REQUEST = 400,
@@ -86,10 +89,16 @@ public:
 		RESPONSE_UNSUPPORTED_MEDIA_TYPE = 415,
 		RESPONSE_REQUESTED_RANGE_NOT_SATISFIABLE = 416,
 		RESPONSE_EXPECTATION_FAILED = 417,
+		RESPONSE_IM_A_TEAPOT = 418,
+		RESPONSE_MISDIRECTED_REQUEST = 421,
 		RESPONSE_UNPROCESSABLE_ENTITY = 422,
 		RESPONSE_LOCKED = 423,
 		RESPONSE_FAILED_DEPENDENCY = 424,
 		RESPONSE_UPGRADE_REQUIRED = 426,
+		RESPONSE_PRECONDITION_REQUIRED = 428,
+		RESPONSE_TOO_MANY_REQUESTS = 429,
+		RESPONSE_REQUEST_HEADER_FIELDS_TOO_LARGE = 431,
+		RESPONSE_UNAVAILABLE_FOR_LEGAL_REASONS = 451,
 
 		// 5xx server error
 		RESPONSE_INTERNAL_SERVER_ERROR = 500,
@@ -98,8 +107,11 @@ public:
 		RESPONSE_SERVICE_UNAVAILABLE = 503,
 		RESPONSE_GATEWAY_TIMEOUT = 504,
 		RESPONSE_HTTP_VERSION_NOT_SUPPORTED = 505,
+		RESPONSE_VARIANT_ALSO_NEGOTIATES = 506,
 		RESPONSE_INSUFFICIENT_STORAGE = 507,
+		RESPONSE_LOOP_DETECTED = 508,
 		RESPONSE_NOT_EXTENDED = 510,
+		RESPONSE_NETWORK_AUTH_REQUIRED = 511,
 
 	};
 
@@ -113,24 +125,37 @@ public:
 		METHOD_OPTIONS,
 		METHOD_TRACE,
 		METHOD_CONNECT,
+		METHOD_PATCH,
 		METHOD_MAX
+
 	};
 
 	enum Status {
+
 		STATUS_DISCONNECTED,
-		STATUS_RESOLVING, //resolving hostname (if passed a hostname)
+		STATUS_RESOLVING, // Resolving hostname (if passed a hostname)
 		STATUS_CANT_RESOLVE,
-		STATUS_CONNECTING, //connecting to ip
+		STATUS_CONNECTING, // Connecting to IP
 		STATUS_CANT_CONNECT,
-		STATUS_CONNECTED, //connected, requests only accepted here
-		STATUS_REQUESTING, // request in progress
-		STATUS_BODY, // request resulted in body, which must be read
+		STATUS_CONNECTED, // Connected, requests can be made
+		STATUS_REQUESTING, // Request in progress
+		STATUS_BODY, // Request resulted in body, which must be read
 		STATUS_CONNECTION_ERROR,
 		STATUS_SSL_HANDSHAKE_ERROR,
 
 	};
 
 private:
+	static const char *_methods[METHOD_MAX];
+	static const int HOST_MIN_LEN = 4;
+
+	enum Port {
+
+		PORT_HTTP = 80,
+		PORT_HTTPS = 443,
+
+	};
+
 #ifndef JAVASCRIPT_ENABLED
 	Status status;
 	IP::ResolverID resolving;
@@ -167,8 +192,7 @@ private:
 	static void _bind_methods();
 
 public:
-	//Error connect_and_get(const String& p_url,bool p_verify_host=true); //connects to a full url and perform request
-	Error connect_to_host(const String &p_host, int p_port, bool p_ssl = false, bool p_verify_host = true);
+	Error connect_to_host(const String &p_host, int p_port = -1, bool p_ssl = false, bool p_verify_host = true);
 
 	void set_connection(const Ref<StreamPeer> &p_connection);
 	Ref<StreamPeer> get_connection() const;
@@ -186,9 +210,9 @@ public:
 	Error get_response_headers(List<String> *r_response);
 	int get_response_body_length() const;
 
-	PoolByteArray read_response_body_chunk(); // can't get body as partial text because of most encodings UTF8, gzip, etc.
+	PoolByteArray read_response_body_chunk(); // Can't get body as partial text because of most encodings UTF8, gzip, etc.
 
-	void set_blocking_mode(bool p_enable); //useful mostly if running in a thread
+	void set_blocking_mode(bool p_enable); // Useful mostly if running in a thread
 	bool is_blocking_mode_enabled() const;
 
 	void set_read_chunk_size(int p_size);
diff --git a/core/io/packet_peer.cpp b/core/io/packet_peer.cpp
index 16c73c26e7..c6b12f73ae 100644
--- a/core/io/packet_peer.cpp
+++ b/core/io/packet_peer.cpp
@@ -49,7 +49,7 @@ bool PacketPeer::is_object_decoding_allowed() const {
 	return allow_object_decoding;
 }
 
-Error PacketPeer::get_packet_buffer(PoolVector<uint8_t> &r_buffer) const {
+Error PacketPeer::get_packet_buffer(PoolVector<uint8_t> &r_buffer) {
 
 	const uint8_t *buffer;
 	int buffer_size;
@@ -78,7 +78,7 @@ Error PacketPeer::put_packet_buffer(const PoolVector<uint8_t> &p_buffer) {
 	return put_packet(&r[0], len);
 }
 
-Error PacketPeer::get_var(Variant &r_variant) const {
+Error PacketPeer::get_var(Variant &r_variant) {
 
 	const uint8_t *buffer;
 	int buffer_size;
@@ -107,7 +107,7 @@ Error PacketPeer::put_var(const Variant &p_packet) {
 	return put_packet(buf, len);
 }
 
-Variant PacketPeer::_bnd_get_var() const {
+Variant PacketPeer::_bnd_get_var() {
 	Variant var;
 	get_var(var);
 
@@ -117,7 +117,7 @@ Variant PacketPeer::_bnd_get_var() const {
 Error PacketPeer::_put_packet(const PoolVector<uint8_t> &p_buffer) {
 	return put_packet_buffer(p_buffer);
 }
-PoolVector<uint8_t> PacketPeer::_get_packet() const {
+PoolVector<uint8_t> PacketPeer::_get_packet() {
 
 	PoolVector<uint8_t> raw;
 	last_get_error = get_packet_buffer(raw);
@@ -202,7 +202,7 @@ int PacketPeerStream::get_available_packet_count() const {
 	return count;
 }
 
-Error PacketPeerStream::get_packet(const uint8_t **r_buffer, int &r_buffer_size) const {
+Error PacketPeerStream::get_packet(const uint8_t **r_buffer, int &r_buffer_size) {
 
 	ERR_FAIL_COND_V(peer.is_null(), ERR_UNCONFIGURED);
 	_poll_buffer();
diff --git a/core/io/packet_peer.h b/core/io/packet_peer.h
index b08d44ad8a..a6d363ec12 100644
--- a/core/io/packet_peer.h
+++ b/core/io/packet_peer.h
@@ -37,13 +37,13 @@ class PacketPeer : public Reference {
 
 	GDCLASS(PacketPeer, Reference);
 
-	Variant _bnd_get_var() const;
+	Variant _bnd_get_var();
 	void _bnd_put_var(const Variant &p_var);
 
 	static void _bind_methods();
 
 	Error _put_packet(const PoolVector<uint8_t> &p_buffer);
-	PoolVector<uint8_t> _get_packet() const;
+	PoolVector<uint8_t> _get_packet();
 	Error _get_packet_error() const;
 
 	mutable Error last_get_error;
@@ -52,17 +52,17 @@ class PacketPeer : public Reference {
 
 public:
 	virtual int get_available_packet_count() const = 0;
-	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size) const = 0; ///< buffer is GONE after next get_packet
+	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size) = 0; ///< buffer is GONE after next get_packet
 	virtual Error put_packet(const uint8_t *p_buffer, int p_buffer_size) = 0;
 
 	virtual int get_max_packet_size() const = 0;
 
 	/* helpers / binders */
 
-	virtual Error get_packet_buffer(PoolVector<uint8_t> &r_buffer) const;
+	virtual Error get_packet_buffer(PoolVector<uint8_t> &r_buffer);
 	virtual Error put_packet_buffer(const PoolVector<uint8_t> &p_buffer);
 
-	virtual Error get_var(Variant &r_variant) const;
+	virtual Error get_var(Variant &r_variant);
 	virtual Error put_var(const Variant &p_packet);
 
 	void set_allow_object_decoding(bool p_enable);
@@ -91,7 +91,7 @@ protected:
 
 public:
 	virtual int get_available_packet_count() const;
-	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size) const;
+	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size);
 	virtual Error put_packet(const uint8_t *p_buffer, int p_buffer_size);
 
 	virtual int get_max_packet_size() const;
diff --git a/core/io/resource_format_binary.cpp b/core/io/resource_format_binary.cpp
index df0d41ea9d..92fdbc1581 100644
--- a/core/io/resource_format_binary.cpp
+++ b/core/io/resource_format_binary.cpp
@@ -104,7 +104,7 @@ StringName ResourceInteractiveLoaderBinary::_get_string() {
 
 	uint32_t id = f->get_32();
 	if (id & 0x80000000) {
-		int len = id & 0x7FFFFFFF;
+		uint32_t len = id & 0x7FFFFFFF;
 		if (len > str_buf.size()) {
 			str_buf.resize(len);
 		}
@@ -734,6 +734,7 @@ Error ResourceInteractiveLoaderBinary::poll() {
 	for (int i = 0; i < pc; i++) {
 
 		StringName name = _get_string();
+
 		if (name == StringName()) {
 			error = ERR_FILE_CORRUPT;
 			ERR_FAIL_V(ERR_FILE_CORRUPT);
@@ -902,7 +903,9 @@ void ResourceInteractiveLoaderBinary::open(FileAccess *p_f) {
 
 		ExtResource er;
 		er.type = get_unicode_string();
+
 		er.path = get_unicode_string();
+
 		external_resources.push_back(er);
 	}
 
@@ -1271,7 +1274,7 @@ String ResourceFormatLoaderBinary::get_resource_type(const String &p_path) const
 ///////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////
 
-void ResourceFormatSaverBinaryInstance::_pad_buffer(int p_bytes) {
+void ResourceFormatSaverBinaryInstance::_pad_buffer(FileAccess *f, int p_bytes) {
 
 	int extra = 4 - (p_bytes % 4);
 	if (extra < 4) {
@@ -1280,7 +1283,12 @@ void ResourceFormatSaverBinaryInstance::_pad_buffer(int p_bytes) {
 	}
 }
 
-void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property, const PropertyInfo &p_hint) {
+void ResourceFormatSaverBinaryInstance::_write_variant(const Variant &p_property, const PropertyInfo &p_hint) {
+
+	write_variant(f, p_property, resource_set, external_resources, string_map, p_hint);
+}
+
+void ResourceFormatSaverBinaryInstance::write_variant(FileAccess *f, const Variant &p_property, Set<RES> &resource_set, Map<RES, int> &external_resources, Map<StringName, int> &string_map, const PropertyInfo &p_hint) {
 
 	switch (p_property.get_type()) {
 
@@ -1327,7 +1335,7 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property,
 
 			f->store_32(VARIANT_STRING);
 			String val = p_property;
-			save_unicode_string(val);
+			save_unicode_string(f, val);
 
 		} break;
 		case Variant::VECTOR2: {
@@ -1453,10 +1461,20 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property,
 			if (np.is_absolute())
 				snc |= 0x8000;
 			f->store_16(snc);
-			for (int i = 0; i < np.get_name_count(); i++)
-				f->store_32(get_string_index(np.get_name(i)));
-			for (int i = 0; i < np.get_subname_count(); i++)
-				f->store_32(get_string_index(np.get_subname(i)));
+			for (int i = 0; i < np.get_name_count(); i++) {
+				if (string_map.has(np.get_name(i))) {
+					f->store_32(string_map[np.get_name(i)]);
+				} else {
+					save_unicode_string(f, np.get_name(i), true);
+				}
+			}
+			for (int i = 0; i < np.get_subname_count(); i++) {
+				if (string_map.has(np.get_subname(i))) {
+					f->store_32(string_map[np.get_subname(i)]);
+				} else {
+					save_unicode_string(f, np.get_subname(i), true);
+				}
+			}
 
 		} break;
 		case Variant::_RID: {
@@ -1508,8 +1526,8 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property,
 					continue;
 				*/
 
-				write_variant(E->get());
-				write_variant(d[E->get()]);
+				write_variant(f, E->get(), resource_set, external_resources, string_map);
+				write_variant(f, d[E->get()], resource_set, external_resources, string_map);
 			}
 
 		} break;
@@ -1520,7 +1538,7 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property,
 			f->store_32(uint32_t(a.size()));
 			for (int i = 0; i < a.size(); i++) {
 
-				write_variant(a[i]);
+				write_variant(f, a[i], resource_set, external_resources, string_map);
 			}
 
 		} break;
@@ -1532,7 +1550,7 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property,
 			f->store_32(len);
 			PoolVector<uint8_t>::Read r = arr.read();
 			f->store_buffer(r.ptr(), len);
-			_pad_buffer(len);
+			_pad_buffer(f, len);
 
 		} break;
 		case Variant::POOL_INT_ARRAY: {
@@ -1566,7 +1584,7 @@ void ResourceFormatSaverBinaryInstance::write_variant(const Variant &p_property,
 			f->store_32(len);
 			PoolVector<String>::Read r = arr.read();
 			for (int i = 0; i < len; i++) {
-				save_unicode_string(r[i]);
+				save_unicode_string(f, r[i]);
 			}
 
 		} break;
@@ -1693,10 +1711,14 @@ void ResourceFormatSaverBinaryInstance::_find_resources(const Variant &p_variant
 	}
 }
 
-void ResourceFormatSaverBinaryInstance::save_unicode_string(const String &p_string) {
+void ResourceFormatSaverBinaryInstance::save_unicode_string(FileAccess *f, const String &p_string, bool p_bit_on_len) {
 
 	CharString utf8 = p_string.utf8();
-	f->store_32(utf8.length() + 1);
+	if (p_bit_on_len) {
+		f->store_32(utf8.length() + 1 | 0x80000000);
+	} else {
+		f->store_32(utf8.length() + 1);
+	}
 	f->store_buffer((const uint8_t *)utf8.get_data(), utf8.length() + 1);
 }
 
@@ -1763,7 +1785,7 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path, const RES &p
 		return ERR_CANT_CREATE;
 	}
 
-	save_unicode_string(p_resource->get_class());
+	save_unicode_string(f, p_resource->get_class());
 	f->store_64(0); //offset to import metadata
 	for (int i = 0; i < 14; i++)
 		f->store_32(0); // reserved
@@ -1800,7 +1822,7 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path, const RES &p
 
 	f->store_32(strings.size()); //string table size
 	for (int i = 0; i < strings.size(); i++) {
-		save_unicode_string(strings[i]);
+		save_unicode_string(f, strings[i]);
 	}
 
 	// save external resource table
@@ -1814,10 +1836,10 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path, const RES &p
 
 	for (int i = 0; i < save_order.size(); i++) {
 
-		save_unicode_string(save_order[i]->get_save_class());
+		save_unicode_string(f, save_order[i]->get_save_class());
 		String path = save_order[i]->get_path();
 		path = relative_paths ? local_path.path_to_file(path) : path;
-		save_unicode_string(path);
+		save_unicode_string(f, path);
 	}
 	// save internal resource table
 	f->store_32(saved_resources.size()); //amount of internal resources
@@ -1853,7 +1875,7 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path, const RES &p
 				used_indices.insert(new_subindex);
 			}
 
-			save_unicode_string("local://" + itos(r->get_subindex()));
+			save_unicode_string(f, "local://" + itos(r->get_subindex()));
 			if (takeover_paths) {
 				r->set_path(p_path + "::" + itos(r->get_subindex()), true);
 			}
@@ -1861,7 +1883,7 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path, const RES &p
 			r->set_edited(false);
 #endif
 		} else {
-			save_unicode_string(r->get_path()); //actual external
+			save_unicode_string(f, r->get_path()); //actual external
 		}
 		ofs_pos.push_back(f->get_position());
 		f->store_64(0); //offset in 64 bits
@@ -1875,14 +1897,14 @@ Error ResourceFormatSaverBinaryInstance::save(const String &p_path, const RES &p
 		ResourceData &rd = E->get();
 
 		ofs_table.push_back(f->get_position());
-		save_unicode_string(rd.type);
+		save_unicode_string(f, rd.type);
 		f->store_32(rd.properties.size());
 
 		for (List<Property>::Element *F = rd.properties.front(); F; F = F->next()) {
 
 			Property &p = F->get();
 			f->store_32(p.name_idx);
-			write_variant(p.value, F->get().pi);
+			_write_variant(p.value, F->get().pi);
 		}
 	}
 
diff --git a/core/io/resource_format_binary.h b/core/io/resource_format_binary.h
index 687da0a9b4..176b8350cf 100644
--- a/core/io/resource_format_binary.h
+++ b/core/io/resource_format_binary.h
@@ -140,14 +140,15 @@ class ResourceFormatSaverBinaryInstance {
 		List<Property> properties;
 	};
 
-	void _pad_buffer(int p_bytes);
-	void write_variant(const Variant &p_property, const PropertyInfo &p_hint = PropertyInfo());
+	static void _pad_buffer(FileAccess *f, int p_bytes);
+	void _write_variant(const Variant &p_property, const PropertyInfo &p_hint = PropertyInfo());
 	void _find_resources(const Variant &p_variant, bool p_main = false);
-	void save_unicode_string(const String &p_string);
+	static void save_unicode_string(FileAccess *f, const String &p_string, bool p_bit_on_len = false);
 	int get_string_index(const String &p_string);
 
 public:
 	Error save(const String &p_path, const RES &p_resource, uint32_t p_flags = 0);
+	static void write_variant(FileAccess *f, const Variant &p_property, Set<RES> &resource_set, Map<RES, int> &external_resources, Map<StringName, int> &string_map, const PropertyInfo &p_hint = PropertyInfo());
 };
 
 class ResourceFormatSaverBinary : public ResourceFormatSaver {
diff --git a/core/io/resource_loader.cpp b/core/io/resource_loader.cpp
index ed0d491679..d2aad1d63a 100644
--- a/core/io/resource_loader.cpp
+++ b/core/io/resource_loader.cpp
@@ -196,19 +196,19 @@ RES ResourceLoader::load(const String &p_path, const String &p_type_hint, bool p
 	else
 		local_path = ProjectSettings::get_singleton()->localize_path(p_path);
 
-	bool xl_remapped = false;
-	String path = _path_remap(local_path, &xl_remapped);
-
-	ERR_FAIL_COND_V(path == "", RES());
-
-	if (!p_no_cache && ResourceCache::has(path)) {
+	if (!p_no_cache && ResourceCache::has(local_path)) {
 
 		if (OS::get_singleton()->is_stdout_verbose())
-			print_line("load resource: " + path + " (cached)");
+			print_line("load resource: " + local_path + " (cached)");
 
-		return RES(ResourceCache::get(path));
+		return RES(ResourceCache::get(local_path));
 	}
 
+	bool xl_remapped = false;
+	String path = _path_remap(local_path, &xl_remapped);
+
+	ERR_FAIL_COND_V(path == "", RES());
+
 	if (OS::get_singleton()->is_stdout_verbose())
 		print_line("load resource: " + path);
 
@@ -247,23 +247,23 @@ Ref<ResourceInteractiveLoader> ResourceLoader::load_interactive(const String &p_
 	else
 		local_path = ProjectSettings::get_singleton()->localize_path(p_path);
 
-	bool xl_remapped = false;
-	String path = _path_remap(local_path, &xl_remapped);
-
-	ERR_FAIL_COND_V(path == "", Ref<ResourceInteractiveLoader>());
-
-	if (!p_no_cache && ResourceCache::has(path)) {
+	if (!p_no_cache && ResourceCache::has(local_path)) {
 
 		if (OS::get_singleton()->is_stdout_verbose())
-			print_line("load resource: " + path + " (cached)");
+			print_line("load resource: " + local_path + " (cached)");
 
-		Ref<Resource> res_cached = ResourceCache::get(path);
+		Ref<Resource> res_cached = ResourceCache::get(local_path);
 		Ref<ResourceInteractiveLoaderDefault> ril = Ref<ResourceInteractiveLoaderDefault>(memnew(ResourceInteractiveLoaderDefault));
 
 		ril->resource = res_cached;
 		return ril;
 	}
 
+	bool xl_remapped = false;
+	String path = _path_remap(local_path, &xl_remapped);
+
+	ERR_FAIL_COND_V(path == "", Ref<ResourceInteractiveLoader>());
+
 	if (OS::get_singleton()->is_stdout_verbose())
 		print_line("load resource: ");
 
@@ -426,9 +426,11 @@ String ResourceLoader::get_resource_type(const String &p_path) {
 
 String ResourceLoader::_path_remap(const String &p_path, bool *r_translation_remapped) {
 
-	if (translation_remaps.has(p_path)) {
+	String new_path = p_path;
 
-		Vector<String> &v = *translation_remaps.getptr(p_path);
+	if (translation_remaps.has(new_path)) {
+
+		Vector<String> &v = *translation_remaps.getptr(new_path);
 		String locale = TranslationServer::get_singleton()->get_locale();
 		if (r_translation_remapped) {
 			*r_translation_remapped = true;
@@ -443,12 +445,16 @@ String ResourceLoader::_path_remap(const String &p_path, bool *r_translation_rem
 				continue;
 
 			if (l.begins_with(locale)) {
-				return v[i].left(split);
+				new_path = v[i].left(split);
+				break;
 			}
 		}
 	}
 
-	return p_path;
+	if (path_remaps.has(new_path)) {
+		new_path = path_remaps[new_path];
+	}
+	return new_path;
 }
 
 String ResourceLoader::import_remap(const String &p_path) {
@@ -515,6 +521,27 @@ void ResourceLoader::clear_translation_remaps() {
 	translation_remaps.clear();
 }
 
+void ResourceLoader::load_path_remaps() {
+
+	if (!ProjectSettings::get_singleton()->has_setting("path_remap/remapped_paths"))
+		return;
+
+	PoolVector<String> remaps = ProjectSettings::get_singleton()->get("path_remap/remapped_paths");
+	int rc = remaps.size();
+	ERR_FAIL_COND(rc & 1); //must be even
+	PoolVector<String>::Read r = remaps.read();
+
+	for (int i = 0; i < rc; i += 2) {
+
+		path_remaps[r[i]] = r[i + 1];
+	}
+}
+
+void ResourceLoader::clear_path_remaps() {
+
+	path_remaps.clear();
+}
+
 ResourceLoadErrorNotify ResourceLoader::err_notify = NULL;
 void *ResourceLoader::err_notify_ud = NULL;
 
@@ -526,3 +553,4 @@ bool ResourceLoader::timestamp_on_load = false;
 
 SelfList<Resource>::List ResourceLoader::remapped_list;
 HashMap<String, Vector<String> > ResourceLoader::translation_remaps;
+HashMap<String, String> ResourceLoader::path_remaps;
diff --git a/core/io/resource_loader.h b/core/io/resource_loader.h
index 5deffbca1a..05f01d8d31 100644
--- a/core/io/resource_loader.h
+++ b/core/io/resource_loader.h
@@ -91,6 +91,7 @@ class ResourceLoader {
 	static DependencyErrorNotify dep_err_notify;
 	static bool abort_on_missing_resource;
 	static HashMap<String, Vector<String> > translation_remaps;
+	static HashMap<String, String> path_remaps;
 
 	static String _path_remap(const String &p_path, bool *r_translation_remapped = NULL);
 	friend class Resource;
@@ -137,6 +138,9 @@ public:
 	static String path_remap(const String &p_path);
 	static String import_remap(const String &p_path);
 
+	static void load_path_remaps();
+	static void clear_path_remaps();
+
 	static void reload_translation_remaps();
 	static void load_translation_remaps();
 	static void clear_translation_remaps();
diff --git a/core/os/os.cpp b/core/os/os.cpp
index 8088a6fa74..d81e70e612 100644
--- a/core/os/os.cpp
+++ b/core/os/os.cpp
@@ -606,10 +606,6 @@ bool OS::has_feature(const String &p_feature) {
 	return false;
 }
 
-void *OS::get_stack_bottom() const {
-	return _stack_bottom;
-}
-
 OS::OS() {
 	void *volatile stack_bottom;
 
diff --git a/core/os/os.h b/core/os/os.h
index 979ad7e92a..d9f7b91daa 100644
--- a/core/os/os.h
+++ b/core/os/os.h
@@ -191,7 +191,7 @@ public:
 	virtual bool is_window_maximized() const { return true; }
 	virtual void request_attention() {}
 
-	virtual void set_borderless_window(int p_borderless) {}
+	virtual void set_borderless_window(bool p_borderless) {}
 	virtual bool get_borderless_window() { return 0; }
 
 	virtual void set_ime_position(const Point2 &p_pos) {}
@@ -442,15 +442,9 @@ public:
 	virtual int get_power_seconds_left();
 	virtual int get_power_percent_left();
 
+	virtual void force_process_input(){};
 	bool has_feature(const String &p_feature);
 
-	/**
-	 * Returns the stack bottom of the main thread of the application.
-	 * This may be of use when integrating languages with garbage collectors that
-	 * need to check whether a pointer is on the stack.
-	 */
-	virtual void *get_stack_bottom() const;
-
 	bool is_hidpi_allowed() const { return _allow_hidpi; }
 	OS();
 	virtual ~OS();
diff --git a/core/ustring.cpp b/core/ustring.cpp
index 3a0708851e..1bf7d000c3 100644
--- a/core/ustring.cpp
+++ b/core/ustring.cpp
@@ -734,7 +734,7 @@ Vector<String> String::split_spaces() const {
 	return ret;
 }
 
-Vector<String> String::split(const String &p_splitter, bool p_allow_empty) const {
+Vector<String> String::split(const String &p_splitter, bool p_allow_empty, int p_maxsplit) const {
 
 	Vector<String> ret;
 	int from = 0;
@@ -745,8 +745,21 @@ Vector<String> String::split(const String &p_splitter, bool p_allow_empty) const
 		int end = find(p_splitter, from);
 		if (end < 0)
 			end = len;
-		if (p_allow_empty || (end > from))
-			ret.push_back(substr(from, end - from));
+		if (p_allow_empty || (end > from)) {
+			if (p_maxsplit <= 0)
+				ret.push_back(substr(from, end - from));
+			else if (p_maxsplit > 0) {
+
+				// Put rest of the string and leave cycle.
+				if (p_maxsplit == ret.size()) {
+					ret.push_back(substr(from, len));
+					break;
+				}
+
+				// Otherwise, push items until positive limit is reached.
+				ret.push_back(substr(from, end - from));
+			}
+		}
 
 		if (end == len)
 			break;
diff --git a/core/ustring.h b/core/ustring.h
index 9c24133b55..6541642bd1 100644
--- a/core/ustring.h
+++ b/core/ustring.h
@@ -162,7 +162,7 @@ public:
 	String get_slice(String p_splitter, int p_slice) const;
 	String get_slicec(CharType p_splitter, int p_slice) const;
 
-	Vector<String> split(const String &p_splitter, bool p_allow_empty = true) const;
+	Vector<String> split(const String &p_splitter, bool p_allow_empty = true, int p_maxsplit = 0) const;
 	Vector<String> split_spaces() const;
 	Vector<float> split_floats(const String &p_splitter, bool p_allow_empty = true) const;
 	Vector<float> split_floats_mk(const Vector<String> &p_splitters, bool p_allow_empty = true) const;
diff --git a/core/variant_call.cpp b/core/variant_call.cpp
index f66cce85c9..2b99a60ba5 100644
--- a/core/variant_call.cpp
+++ b/core/variant_call.cpp
@@ -254,7 +254,7 @@ struct _VariantCall {
 	VCALL_LOCALMEM2R(String, replacen);
 	VCALL_LOCALMEM2R(String, insert);
 	VCALL_LOCALMEM0R(String, capitalize);
-	VCALL_LOCALMEM2R(String, split);
+	VCALL_LOCALMEM3R(String, split);
 	VCALL_LOCALMEM2R(String, split_floats);
 	VCALL_LOCALMEM0R(String, to_upper);
 	VCALL_LOCALMEM0R(String, to_lower);
@@ -1446,7 +1446,7 @@ void register_variant_methods() {
 	ADDFUNC2R(STRING, STRING, String, replacen, STRING, "what", STRING, "forwhat", varray());
 	ADDFUNC2R(STRING, STRING, String, insert, INT, "position", STRING, "what", varray());
 	ADDFUNC0R(STRING, STRING, String, capitalize, varray());
-	ADDFUNC2R(STRING, POOL_STRING_ARRAY, String, split, STRING, "divisor", BOOL, "allow_empty", varray(true));
+	ADDFUNC3R(STRING, POOL_STRING_ARRAY, String, split, STRING, "divisor", BOOL, "allow_empty", INT, "maxsplit", varray(true, 0));
 	ADDFUNC2R(STRING, POOL_REAL_ARRAY, String, split_floats, STRING, "divisor", BOOL, "allow_empty", varray(true));
 
 	ADDFUNC0R(STRING, STRING, String, to_upper, varray());
diff --git a/doc/classes/Animation.xml b/doc/classes/Animation.xml
index 93b01a466b..dd248d18f7 100644
--- a/doc/classes/Animation.xml
+++ b/doc/classes/Animation.xml
@@ -38,6 +38,7 @@
 			<argument index="1" name="to_animation" type="Animation">
 			</argument>
 			<description>
+				Adds a new track that is a copy of the given track from [code]to_animation[/code].
 			</description>
 		</method>
 		<method name="find_track" qualifiers="const">
@@ -260,6 +261,7 @@
 			<argument index="0" name="idx" type="int">
 			</argument>
 			<description>
+				Returns [code]true[/code] if the track at index [code]idx[/code] is enabled.
 			</description>
 		</method>
 		<method name="track_is_imported" qualifiers="const">
@@ -319,6 +321,7 @@
 			<argument index="1" name="enabled" type="bool">
 			</argument>
 			<description>
+				Enables/disables the given track. Tracks are enabled by default.
 			</description>
 		</method>
 		<method name="track_set_imported">
diff --git a/doc/classes/AnimationPlayer.xml b/doc/classes/AnimationPlayer.xml
index d61211bb6b..570f5e9741 100644
--- a/doc/classes/AnimationPlayer.xml
+++ b/doc/classes/AnimationPlayer.xml
@@ -90,6 +90,13 @@
 				Returns the list of stored animation names.
 			</description>
 		</method>
+		<method name="get_autoplay" qualifiers="const">
+			<return type="String">
+			</return>
+			<description>
+				Returns the name of the animation that will be automatically played when the scene is loaded.
+			</description>
+		</method>
 		<method name="get_blend_time" qualifiers="const">
 			<return type="float">
 			</return>
@@ -101,11 +108,18 @@
 				Get the blend time (in seconds) between two animations, referenced by their names.
 			</description>
 		</method>
+		<method name="get_current_animation" qualifiers="const">
+			<return type="String">
+			</return>
+			<description>
+				Returns the name of the animation being played.
+			</description>
+		</method>
 		<method name="get_current_animation_length" qualifiers="const">
 			<return type="float">
 			</return>
 			<description>
-				Get the length (in seconds) of the currently playing animation.
+				Returns the length (in seconds) of the animation currently being played.
 			</description>
 		</method>
 		<method name="get_current_animation_position" qualifiers="const">
@@ -115,6 +129,12 @@
 				Get the position (in seconds) of the currently playing animation.
 			</description>
 		</method>
+		<method name="get_speed_scale" qualifiers="const">
+			<return type="float">
+			</return>
+			<description>
+			</description>
+		</method>
 		<method name="has_animation" qualifiers="const">
 			<return type="bool">
 			</return>
@@ -124,6 +144,13 @@
 				Returns [code]true[/code] if the [code]AnimationPlayer[/code] stores an [Animation] with key [code]name[/code].
 			</description>
 		</method>
+		<method name="is_active" qualifiers="const">
+			<return type="bool">
+			</return>
+			<description>
+				Returns [code]true[/code] if the player is active.
+			</description>
+		</method>
 		<method name="is_playing" qualifiers="const">
 			<return type="bool">
 			</return>
@@ -143,7 +170,8 @@
 			<argument index="3" name="from_end" type="bool" default="false">
 			</argument>
 			<description>
-				Play the animation with key [code]name[/code]. Custom speed and blend times can be set. If custom speed is negative (-1), 'from_end' being true can play the animation backwards.
+				Play the animation with key [code]name[/code]. Custom speed and blend times can be set. If custom speed is negative (-1), 'from_end' being true can play the
+				animation backwards.
 			</description>
 		</method>
 		<method name="play_backwards">
@@ -194,7 +222,25 @@
 			<argument index="1" name="update" type="bool" default="false">
 			</argument>
 			<description>
-				Seek the animation to the [code]seconds[/code] point in time (in seconds). If 'update' is true, the animation updates too, otherwise it updates at process time.
+				Seek the animation to the [code]seconds[/code] point in time (in seconds). If [code]update[/code] is [code]true[/code], the animation updates too, otherwise it updates at process time.
+			</description>
+		</method>
+		<method name="set_active">
+			<return type="void">
+			</return>
+			<argument index="0" name="active" type="bool">
+			</argument>
+			<description>
+				Sets the player as active (playing). If [code]true[/code], updates animations in response to process-related notifications. Default value: [code]true[/code].
+			</description>
+		</method>
+		<method name="set_autoplay">
+			<return type="void">
+			</return>
+			<argument index="0" name="name" type="String">
+			</argument>
+			<description>
+				Defines the name of the animation to play when the scene loads. Default value: [code]""[/code].
 			</description>
 		</method>
 		<method name="set_blend_time">
@@ -210,34 +256,37 @@
 				Specify a blend time (in seconds) between two animations, referenced by their names.
 			</description>
 		</method>
-		<method name="stop">
+		<method name="set_current_animation">
 			<return type="void">
 			</return>
-			<argument index="0" name="reset" type="bool" default="true">
+			<argument index="0" name="anim" type="String">
 			</argument>
 			<description>
-				Stop the currently playing animation. If [code]reset[/code] is [code]true[/code], the anim position is reset to [code]0[/code].
+				Sets the name of the current animation. If already playing, restarts the animation. Ensure [member active] is [code]true[/code] to simulate [method play]. Default value: [code]""[/code].
 			</description>
 		</method>
-		<method name="stop_all">
+		<method name="set_speed_scale">
 			<return type="void">
 			</return>
+			<argument index="0" name="speed" type="float">
+			</argument>
 			<description>
-				Stop playback of animations (deprecated).
+				Sets the speed scaling ratio in a given animation channel (or channel 0 if none is provided). Default value: [code]1[/code].
+			</description>
+		</method>
+		<method name="stop">
+			<return type="void">
+			</return>
+			<argument index="0" name="reset" type="bool" default="true">
+			</argument>
+			<description>
+				Stop the currently playing animation. If [code]reset[/code] is [code]true[/code], the anim position is reset to [code]0[/code].
 			</description>
 		</method>
 	</methods>
 	<members>
-		<member name="active" type="bool" setter="set_active" getter="is_active">
-			If [code]true[/code] updates animations in response to process-related notifications. Default value: [code]true[/code].
-		</member>
-		<member name="autoplay" type="String" setter="set_autoplay" getter="get_autoplay">
-			The name of the animation to play when the scene loads. Default value: [code]""[/code].
-		</member>
-		<member name="current_animation" type="String" setter="set_current_animation" getter="get_current_animation">
-			The name of the current animation. Default value: [code]""[/code].
-		</member>
 		<member name="playback_default_blend_time" type="float" setter="set_default_blend_time" getter="get_default_blend_time">
+			The default time in which to blend animations. Ranges from 0 to 4096 with 0.01 precision. Default value: [code]0[/code].
 		</member>
 		<member name="playback_process_mode" type="int" setter="set_animation_process_mode" getter="get_animation_process_mode" enum="AnimationPlayer.AnimationProcessMode">
 			The process notification in which to update animations. Default value: [enum ANIMATION_PROCESS_IDLE].
@@ -245,9 +294,6 @@
 		<member name="root_node" type="NodePath" setter="set_root" getter="get_root">
 			The node from which node path references will travel. Default value: [code]".."[/code].
 		</member>
-		<member name="speed_scale" type="float" setter="set_speed_scale" getter="get_speed_scale">
-			The speed scaling ratio in a given animation channel (or channel 0 if none is provided). Default value: [code]1[/code].
-		</member>
 	</members>
 	<signals>
 		<signal name="animation_changed">
@@ -256,21 +302,21 @@
 			<argument index="1" name="new_name" type="String">
 			</argument>
 			<description>
-				Emitted when the [Animation] with key [member current_anim] is modified.
+				Emitted when the animation currently being played changes.
 			</description>
 		</signal>
 		<signal name="animation_finished">
 			<argument index="0" name="name" type="String">
 			</argument>
 			<description>
-				Emitted when an animation finishes.
+				Notifies when an animation finishes playing.
 			</description>
 		</signal>
 		<signal name="animation_started">
 			<argument index="0" name="name" type="String">
 			</argument>
 			<description>
-				Emitted when an animation starts.
+				Notifies when an animation starts playing.
 			</description>
 		</signal>
 	</signals>
diff --git a/doc/classes/CanvasItem.xml b/doc/classes/CanvasItem.xml
index cf0b482b07..bd20cfcf5d 100644
--- a/doc/classes/CanvasItem.xml
+++ b/doc/classes/CanvasItem.xml
@@ -36,7 +36,7 @@
 			<argument index="4" name="modulate" type="Color" default="Color( 1, 1, 1, 1 )">
 			</argument>
 			<description>
-				Draw a string character using a custom font. Returns the advance, depending on the char width and kerning with an optional next char.
+				Draws a string character using a custom font. Returns the advance, depending on the char width and kerning with an optional next char.
 			</description>
 		</method>
 		<method name="draw_circle">
@@ -49,7 +49,7 @@
 			<argument index="2" name="color" type="Color">
 			</argument>
 			<description>
-				Draw a colored circle.
+				Draws a colored circle.
 			</description>
 		</method>
 		<method name="draw_colored_polygon">
@@ -68,7 +68,7 @@
 			<argument index="5" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
-				Draw a colored polygon of any amount of points, convex or concave.
+				Draws a colored polygon of any amount of points, convex or concave.
 			</description>
 		</method>
 		<method name="draw_line">
@@ -85,7 +85,7 @@
 			<argument index="4" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
-				Draw a line from a 2D point to another, with a given color and width. It can be optionally antialiased.
+				Draws a line from a 2D point to another, with a given color and width. It can be optionally antialiased.
 			</description>
 		</method>
 		<method name="draw_multiline">
@@ -100,6 +100,7 @@
 			<argument index="3" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
+				Draws multiple, parallel lines with a uniform [code]color[/code] and [code]width[/code] and optional antialiasing.
 			</description>
 		</method>
 		<method name="draw_multiline_colors">
@@ -114,6 +115,7 @@
 			<argument index="3" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
+				Draws multiple, parallel lines with a uniform [code]width[/code], segment-by-segment coloring, and optional antialiasing. Colors assigned to line segments match by index between [code]points[/code] and [code]colors[/code].
 			</description>
 		</method>
 		<method name="draw_polygon">
@@ -132,7 +134,7 @@
 			<argument index="5" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
-				Draw a polygon of any amount of points, convex or concave.
+				Draws a polygon of any amount of points, convex or concave.
 			</description>
 		</method>
 		<method name="draw_polyline">
@@ -147,7 +149,7 @@
 			<argument index="3" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
-				Draw a polyline with a uniform [code]color[/code] and [code]width[/code] and optional antialiasing.
+				Draws interconnected line segments with a uniform [code]color[/code] and [code]width[/code] and optional antialiasing.
 			</description>
 		</method>
 		<method name="draw_polyline_colors">
@@ -162,7 +164,7 @@
 			<argument index="3" name="antialiased" type="bool" default="false">
 			</argument>
 			<description>
-				Draw a polyline with a uniform [code]width[/code], segment-by-segment coloring, and optional antialiasing. Colors assigned to line segments match by index between [code]points[/code] and [code]colors[/code].
+				Draws interconnected line segments with a uniform [code]width[/code], segment-by-segment coloring, and optional antialiasing. Colors assigned to line segments match by index between [code]points[/code] and [code]colors[/code].
 			</description>
 		</method>
 		<method name="draw_primitive">
@@ -181,7 +183,7 @@
 			<argument index="5" name="normal_map" type="Texture" default="null">
 			</argument>
 			<description>
-				Draw a custom primitive, 1 point for a point, 2 points for a line, 3 points for a triangle and 4 points for a quad.
+				Draws a custom primitive, 1 point for a point, 2 points for a line, 3 points for a triangle and 4 points for a quad.
 			</description>
 		</method>
 		<method name="draw_rect">
@@ -194,7 +196,7 @@
 			<argument index="2" name="filled" type="bool" default="true">
 			</argument>
 			<description>
-				Draw a colored rectangle.
+				Draws a colored rectangle.
 			</description>
 		</method>
 		<method name="draw_set_transform">
@@ -233,7 +235,7 @@
 			<argument index="4" name="clip_w" type="int" default="-1">
 			</argument>
 			<description>
-				Draw a string using a custom font.
+				Draws a string using a custom font.
 			</description>
 		</method>
 		<method name="draw_style_box">
@@ -244,7 +246,7 @@
 			<argument index="1" name="rect" type="Rect2">
 			</argument>
 			<description>
-				Draw a styled rectangle.
+				Draws a styled rectangle.
 			</description>
 		</method>
 		<method name="draw_texture">
@@ -259,7 +261,7 @@
 			<argument index="3" name="normal_map" type="Texture" default="null">
 			</argument>
 			<description>
-				Draw a texture at a given position.
+				Draws a texture at a given position.
 			</description>
 		</method>
 		<method name="draw_texture_rect">
@@ -278,7 +280,7 @@
 			<argument index="5" name="normal_map" type="Texture" default="null">
 			</argument>
 			<description>
-				Draw a textured rectangle at a given position, optionally modulated by a color. Transpose swaps the x and y coordinates when reading the texture.
+				Draws a textured rectangle at a given position, optionally modulated by a color. Transpose swaps the x and y coordinates when reading the texture.
 			</description>
 		</method>
 		<method name="draw_texture_rect_region">
@@ -299,7 +301,7 @@
 			<argument index="6" name="clip_uv" type="bool" default="true">
 			</argument>
 			<description>
-				Draw a textured rectangle region at a given position, optionally modulated by a color. Transpose swaps the x and y coordinates when reading the texture.
+				Draws a textured rectangle region at a given position, optionally modulated by a color. Transpose swaps the x and y coordinates when reading the texture.
 			</description>
 		</method>
 		<method name="get_canvas" qualifiers="const">
diff --git a/doc/classes/Curve.xml b/doc/classes/Curve.xml
index 3e1158ca3b..f7ef9a182c 100644
--- a/doc/classes/Curve.xml
+++ b/doc/classes/Curve.xml
@@ -1,8 +1,10 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <class name="Curve" inherits="Resource" category="Core" version="3.0-beta">
 	<brief_description>
+		A mathematical curve.
 	</brief_description>
 	<description>
+		A curve that can be saved and re-used for other objects. By default it ranges between [code]0[/code] and [code]1[/code] on the y-axis and positions points relative to the [code]0.5[/code] y-position.
 	</description>
 	<tutorials>
 	</tutorials>
@@ -23,24 +25,28 @@
 			<argument index="4" name="right_mode" type="int" enum="Curve.TangentMode" default="0">
 			</argument>
 			<description>
+				Adds a point to the curve. For each side, if the [code]*_mode[/code] is [code]TANGENT_LINEAR[/code], the [code]*_tangent[/code] angle (in degrees) uses the slope of the curve halfway to the adjacent point. Allows custom assignments to the [code]*_tangent[/code] angle if [code]*_mode[/code] is set to [code]TANGENT_FREE[/code].
 			</description>
 		</method>
 		<method name="bake">
 			<return type="void">
 			</return>
 			<description>
+				Recomputes the baked cache of points for the curve.
 			</description>
 		</method>
 		<method name="clean_dupes">
 			<return type="void">
 			</return>
 			<description>
+				Removes points that are closer than [code]CMP_EPSILON[/code] (0.00001) units to their neighbor on the curve.
 			</description>
 		</method>
 		<method name="clear_points">
 			<return type="void">
 			</return>
 			<description>
+				Removes all points from the curve.
 			</description>
 		</method>
 		<method name="get_point_left_mode" qualifiers="const">
@@ -49,6 +55,7 @@
 			<argument index="0" name="index" type="int">
 			</argument>
 			<description>
+				Returns the left [code]TangentMode[/code] for the point at [code]index[/code].
 			</description>
 		</method>
 		<method name="get_point_left_tangent" qualifiers="const">
@@ -57,6 +64,7 @@
 			<argument index="0" name="index" type="int">
 			</argument>
 			<description>
+				Returns the left tangent angle (in degrees) for the point at [code]index[/code].
 			</description>
 		</method>
 		<method name="get_point_position" qualifiers="const">
@@ -65,6 +73,7 @@
 			<argument index="0" name="index" type="int">
 			</argument>
 			<description>
+				Returns the curve coordinates for the point at [code]index[/code].
 			</description>
 		</method>
 		<method name="get_point_right_mode" qualifiers="const">
@@ -73,6 +82,7 @@
 			<argument index="0" name="index" type="int">
 			</argument>
 			<description>
+				Returns the right [code]TangentMode[/code] for the point at [code]index[/code].
 			</description>
 		</method>
 		<method name="get_point_right_tangent" qualifiers="const">
@@ -81,6 +91,7 @@
 			<argument index="0" name="index" type="int">
 			</argument>
 			<description>
+				Returns the right tangent angle (in degrees) for the point at [code]index[/code].
 			</description>
 		</method>
 		<method name="interpolate" qualifiers="const">
@@ -89,12 +100,14 @@
 			<argument index="0" name="offset" type="float">
 			</argument>
 			<description>
+				Returns the y value for the point that would exist at x-position [code]offset[/code] along the curve.
 			</description>
 		</method>
 		<method name="interpolate_baked">
 			<return type="float">
 			</return>
 			<argument index="0" name="offset" type="float">
 			</argument>
 			<description>
+				Returns the y value for the point that would exist at x-position [code]offset[/code] along the curve using the baked cache. Bakes the curve's points if not already baked.
 			</description>
@@ -105,6 +118,7 @@
 			<argument index="0" name="index" type="int">
 			</argument>
 			<description>
+				Removes the point at [code]index[/code] from the curve.
 			</description>
 		</method>
 		<method name="set_point_left_mode">
@@ -115,6 +129,7 @@
 			<argument index="1" name="mode" type="int" enum="Curve.TangentMode">
 			</argument>
 			<description>
+				Sets the left [code]TangentMode[/code] for the point at [code]index[/code] to [code]mode[/code].
 			</description>
 		</method>
 		<method name="set_point_left_tangent">
@@ -125,6 +140,7 @@
 			<argument index="1" name="tangent" type="float">
 			</argument>
 			<description>
+				Sets the left tangent angle for the point at [code]index[/code] to [code]tangent[/code].
 			</description>
 		</method>
 		<method name="set_point_offset">
@@ -135,6 +151,7 @@
 			<argument index="1" name="offset" type="float">
 			</argument>
 			<description>
+				Sets the offset from [code]0.5[/code].
 			</description>
 		</method>
 		<method name="set_point_right_mode">
@@ -145,6 +162,7 @@
 			<argument index="1" name="mode" type="int" enum="Curve.TangentMode">
 			</argument>
 			<description>
+				Sets the right [code]TangentMode[/code] for the point at [code]index[/code] to [code]mode[/code].
 			</description>
 		</method>
 		<method name="set_point_right_tangent">
@@ -155,6 +173,7 @@
 			<argument index="1" name="tangent" type="float">
 			</argument>
 			<description>
+				Sets the right tangent angle for the point at [code]index[/code] to [code]tangent[/code].
 			</description>
 		</method>
 		<method name="set_point_value">
@@ -165,29 +184,37 @@
 			<argument index="1" name="y" type="float">
 			</argument>
 			<description>
+				Assigns the vertical position [code]y[/code] to the point at [code]index[/code].
 			</description>
 		</method>
 	</methods>
 	<members>
 		<member name="bake_resolution" type="int" setter="set_bake_resolution" getter="get_bake_resolution">
+			The number of points to include in the baked (i.e. cached) curve data.
 		</member>
 		<member name="max_value" type="float" setter="set_max_value" getter="get_max_value">
+			The maximum value the curve can reach. Default value: [code]1[/code].
 		</member>
 		<member name="min_value" type="float" setter="set_min_value" getter="get_min_value">
+			The minimum value the curve can reach. Default value: [code]0[/code].
 		</member>
 	</members>
 	<signals>
 		<signal name="range_changed">
 			<description>
+				Emitted when [member max_value] or [member min_value] is changed.
 			</description>
 		</signal>
 	</signals>
 	<constants>
 		<constant name="TANGENT_FREE" value="0" enum="TangentMode">
+			The tangent on this side of the point is user-defined.
 		</constant>
 		<constant name="TANGENT_LINEAR" value="1" enum="TangentMode">
+			The curve calculates the tangent on this side of the point as the slope halfway towards the adjacent point.
 		</constant>
 		<constant name="TANGENT_MODE_COUNT" value="2" enum="TangentMode">
+			The total number of available tangent modes.
 		</constant>
 	</constants>
 </class>
diff --git a/doc/classes/Curve3D.xml b/doc/classes/Curve3D.xml
index e30ae85617..91d7a9bed8 100644
--- a/doc/classes/Curve3D.xml
+++ b/doc/classes/Curve3D.xml
@@ -32,6 +32,7 @@
 			<return type="void">
 			</return>
 			<description>
+				Removes all points from the curve.
 			</description>
 		</method>
 		<method name="get_baked_length" qualifiers="const">
@@ -203,6 +204,7 @@
 	</methods>
 	<members>
 		<member name="bake_interval" type="float" setter="set_bake_interval" getter="get_bake_interval">
+			The distance in meters between two adjacent cached points. Changing it forces the cache to be recomputed the next time the [method get_baked_points] or [method get_baked_length] function is called. The smaller the distance, the more points in the cache and the more memory it will consume, so use with care.
 		</member>
 	</members>
 	<constants>
diff --git a/doc/classes/HTTPClient.xml b/doc/classes/HTTPClient.xml
index b90c49b5c0..9d4b45a8d7 100644
--- a/doc/classes/HTTPClient.xml
+++ b/doc/classes/HTTPClient.xml
@@ -224,7 +224,10 @@
 		<constant name="METHOD_CONNECT" value="7" enum="Method">
 			HTTP CONNECT method. The CONNECT method establishes a tunnel to the server identified by the target resource. Rarely used.
 		</constant>
-		<constant name="METHOD_MAX" value="8" enum="Method">
+		<constant name="METHOD_PATCH" value="8" enum="Method">
+			HTTP PATCH method. The PATCH method is used to apply partial modifications to a resource.
+		</constant>
+		<constant name="METHOD_MAX" value="9" enum="Method">
 			Marker for end of [code]METHOD_*[/code] enum. Not used.
 		</constant>
 		<constant name="STATUS_DISCONNECTED" value="0" enum="Status">
@@ -290,6 +293,9 @@
 		<constant name="RESPONSE_MULTI_STATUS" value="207" enum="ResponseCode">
 			HTTP status code [code]207 Multi-Status[/code] (WebDAV). A Multi-Status response conveys information about multiple resources in situations where multiple status codes might be appropriate.
 		</constant>
+		<constant name="RESPONSE_ALREADY_REPORTED" value="208" enum="ResponseCode">
+			HTTP status code [code]208 Already Reported[/code] (WebDAV). Used inside a DAV: propstat response element to avoid enumerating the internal members of multiple bindings to the same collection repeatedly.
+		</constant>
 		<constant name="RESPONSE_IM_USED" value="226" enum="ResponseCode">
 			HTTP status code [code]226 IM Used[/code] (WebDAV). The server has fulfilled a GET request for the resource, and the response is a representation of the result of one or more instance-manipulations applied to the current instance.
 		</constant>
@@ -311,9 +317,15 @@
 		<constant name="RESPONSE_USE_PROXY" value="305" enum="ResponseCode">
 			HTTP status code [code]305 Use Proxy[/code]. Deprecated. Do not use.
 		</constant>
+		<constant name="RESPONSE_SWITCH_PROXY" value="306" enum="ResponseCode">
+			HTTP status code [code]306 Switch Proxy[/code]. Deprecated. Do not use.
+		</constant>
 		<constant name="RESPONSE_TEMPORARY_REDIRECT" value="307" enum="ResponseCode">
 			HTTP status code [code]307 Temporary Redirect[/code]. The target resource resides temporarily under a different URI and the user agent MUST NOT change the request method if it performs an automatic redirection to that URI.
 		</constant>
+		<constant name="RESPONSE_PERMANENT_REDIRECT" value="308" enum="ResponseCode">
+			HTTP status code [code]308 Permanent Redirect[/code]. The target resource has been assigned a new permanent URI and any future references to this resource ought to use one of the enclosed URIs.
+		</constant>
 		<constant name="RESPONSE_BAD_REQUEST" value="400" enum="ResponseCode">
 			HTTP status code [code]400 Bad Request[/code]. The request was invalid. The server cannot or will not process the request due to something that is perceived to be a client error (e.g., malformed request syntax, invalid request message framing, invalid request contents, or deceptive request routing).
 		</constant>
@@ -368,6 +380,12 @@
 		<constant name="RESPONSE_EXPECTATION_FAILED" value="417" enum="ResponseCode">
 			HTTP status code [code]417 Expectation Failed[/code]. The expectation given in the request's Expect header field could not be met by at least one of the inbound servers.
 		</constant>
+		<constant name="RESPONSE_IM_A_TEAPOT" value="418" enum="ResponseCode">
+			HTTP status code [code]418 I'm A Teapot[/code]. Any attempt to brew coffee with a teapot should result in the error code "418 I'm a teapot". The resulting entity body MAY be short and stout.
+		</constant>
+		<constant name="RESPONSE_MISDIRECTED_REQUEST" value="421" enum="ResponseCode">
+			HTTP status code [code]421 Misdirected Request[/code]. The request was directed at a server that is not able to produce a response. This can be sent by a server that is not configured to produce responses for the combination of scheme and authority that are included in the request URI.
+		</constant>
 		<constant name="RESPONSE_UNPROCESSABLE_ENTITY" value="422" enum="ResponseCode">
 			HTTP status code [code]422 Unprocessable Entity[/code] (WebDAV). The server understands the content type of the request entity (hence a 415 Unsupported Media Type status code is inappropriate), and the syntax of the request entity is correct (thus a 400 Bad Request status code is inappropriate) but was unable to process the contained instructions.
 		</constant>
@@ -380,6 +398,18 @@
 		<constant name="RESPONSE_UPGRADE_REQUIRED" value="426" enum="ResponseCode">
 			HTTP status code [code]426 Upgrade Required[/code]. The server refuses to perform the request using the current protocol but might be willing to do so after the client upgrades to a different protocol.
 		</constant>
+		<constant name="RESPONSE_PRECONDITION_REQUIRED" value="428" enum="ResponseCode">
+			HTTP status code [code]428 Precondition Required[/code]. The origin server requires the request to be conditional.
+		</constant>
+		<constant name="RESPONSE_TOO_MANY_REQUESTS" value="429" enum="ResponseCode">
+			HTTP status code [code]429 Too Many Requests[/code]. The user has sent too many requests in a given amount of time (see "rate limiting"). Back off and increase time between requests or try again later.
+		</constant>
+		<constant name="RESPONSE_REQUEST_HEADER_FIELDS_TOO_LARGE" value="431" enum="ResponseCode">
+			HTTP status code [code]431 Request Header Fields Too Large[/code]. The server is unwilling to process the request because its header fields are too large. The request MAY be resubmitted after reducing the size of the request header fields.
+		</constant>
+		<constant name="RESPONSE_UNAVAILABLE_FOR_LEGAL_REASONS" value="451" enum="ResponseCode">
+			HTTP status code [code]451 Unavailable For Legal Reasons[/code]. The server is denying access to the resource as a consequence of a legal demand.
+		</constant>
 		<constant name="RESPONSE_INTERNAL_SERVER_ERROR" value="500" enum="ResponseCode">
 			HTTP status code [code]500 Internal Server Error[/code]. The server encountered an unexpected condition that prevented it from fulfilling the request.
 		</constant>
@@ -398,11 +428,20 @@
 		<constant name="RESPONSE_HTTP_VERSION_NOT_SUPPORTED" value="505" enum="ResponseCode">
 			HTTP status code [code]505 HTTP Version Not Supported[/code]. The server does not support, or refuses to support, the major version of HTTP that was used in the request message.
 		</constant>
+		<constant name="RESPONSE_VARIANT_ALSO_NEGOTIATES" value="506" enum="ResponseCode">
+			HTTP status code [code]506 Variant Also Negotiates[/code]. The server has an internal configuration error: the chosen variant resource is configured to engage in transparent content negotiation itself, and is therefore not a proper end point in the negotiation process.
+		</constant>
 		<constant name="RESPONSE_INSUFFICIENT_STORAGE" value="507" enum="ResponseCode">
 			HTTP status code [code]507 Insufficient Storage[/code]. The method could not be performed on the resource because the server is unable to store the representation needed to successfully complete the request.
 		</constant>
+		<constant name="RESPONSE_LOOP_DETECTED" value="508" enum="ResponseCode">
+			HTTP status code [code]508 Loop Detected[/code]. The server terminated an operation because it encountered an infinite loop while processing a request with "Depth: infinity". This status indicates that the entire operation failed.
+		</constant>
 		<constant name="RESPONSE_NOT_EXTENDED" value="510" enum="ResponseCode">
 			HTTP status code [code]510 Not Extended[/code]. The policy for accessing the resource has not been met in the request. The server should send back all the information necessary for the client to issue an extended request.
 		</constant>
+		<constant name="RESPONSE_NETWORK_AUTH_REQUIRED" value="511" enum="ResponseCode">
+			HTTP status code [code]511 Network Authentication Required[/code]. The client needs to authenticate to gain network access.
+		</constant>
 	</constants>
 </class>
diff --git a/doc/classes/JSON.xml b/doc/classes/JSON.xml
index bb48833878..28f04c3eb0 100644
--- a/doc/classes/JSON.xml
+++ b/doc/classes/JSON.xml
@@ -4,7 +4,7 @@
 		Helper class for parsing JSON data.
 	</brief_description>
 	<description>
-		Helper class for parsing JSON data.
+		Helper class for parsing JSON data. For a usage example, see [JSONParseResult].
 	</description>
 	<tutorials>
 	</tutorials>
diff --git a/doc/classes/JSONParseResult.xml b/doc/classes/JSONParseResult.xml
index 77145eff6a..835920818a 100644
--- a/doc/classes/JSONParseResult.xml
+++ b/doc/classes/JSONParseResult.xml
@@ -14,23 +14,23 @@
 	</methods>
 	<members>
 		<member name="error" type="int" setter="set_error" getter="get_error" enum="Error">
-			The error type if JSON source was not successfully parsed. See [@GlobalScope]ERR_* constants.
+			The error type if JSON source was not successfully parsed. See [@GlobalScope] ERR_* constants.
 		</member>
 		<member name="error_line" type="int" setter="set_error_line" getter="get_error_line">
 			The line number where the error occurred if JSON source was not successfully parsed.
 		</member>
 		<member name="error_string" type="String" setter="set_error_string" getter="get_error_string">
-			The error message if JSON source was not successfully parsed. See [@GlobalScope]ERR_* constants.
+			The error message if JSON source was not successfully parsed. See [@GlobalScope] ERR_* constants.
 		</member>
 		<member name="result" type="Variant" setter="set_result" getter="get_result">
-			A [Variant] containing the parsed JSON. Use typeof() to check if it is what you expect. For example, if JSON source starts with braces [code]{}[/code] a [Dictionary] will be returned, if JSON source starts with array braces [code][][/code] an [Array] will be returned.
+			A [Variant] containing the parsed JSON. Use typeof() to check if it is what you expect. For example, if the JSON source starts with curly braces ([code]{}[/code]) a [Dictionary] will be returned; if it starts with square brackets ([code][][/code]) an [Array] will be returned.
 			[i]Be aware that the JSON specification does not define integer or float types, but only a number type. Therefore, parsing a JSON text will convert all numerical values to float types.[/i]
 			[codeblock]
 			p = JSON.parse('["hello", "world", "!"]')
 			if typeof(p) == TYPE_ARRAY:
-				print(p[0]) # prints 'hello'
+			    print(p[0]) # prints 'hello'
 			else:
-				print("unexpected results")
+			    print("unexpected results")
 			[/codeblock]
 		</member>
 	</members>
diff --git a/doc/classes/Navigation.xml b/doc/classes/Navigation.xml
index 4bfe964a4d..8fe520f853 100644
--- a/doc/classes/Navigation.xml
+++ b/doc/classes/Navigation.xml
@@ -1,8 +1,10 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <class name="Navigation" inherits="Spatial" category="Core" version="3.0-beta">
 	<brief_description>
+		A collection of [code]NavigationMesh[/code] resources and methods used for pathfinding.
 	</brief_description>
 	<description>
+		The Navigation node is used for basic or advanced navigation. By default it will automatically collect all child [code]NavigationMesh[/code] resources, but they can also be added on the fly through scripting. It can be used for generating a simple path between two points or it can be used to ensure that a navigation agent is angled perfectly to the terrain it is navigating.
 	</description>
 	<tutorials>
 	</tutorials>
@@ -15,6 +17,7 @@
 			<argument index="0" name="to_point" type="Vector3">
 			</argument>
 			<description>
+				Returns the closest navigation point to the point passed.
 			</description>
 		</method>
 		<method name="get_closest_point_normal">
@@ -23,6 +26,7 @@
 			<argument index="0" name="to_point" type="Vector3">
 			</argument>
 			<description>
+				Returns the surface normal of the navigation mesh at the point passed. For instance, if the point passed was on a 45-degree slope it would return something like (0.707, 0.707, 0). This is useful for rotating a navigation agent in accordance with the [code]NavigationMesh[/code].
 			</description>
 		</method>
 		<method name="get_closest_point_owner">
@@ -31,6 +35,7 @@
 			<argument index="0" name="to_point" type="Vector3">
 			</argument>
 			<description>
+				Returns the nearest [code]NavigationMeshInstance[/code] to the point passed.
 			</description>
 		</method>
 		<method name="get_closest_point_to_segment">
@@ -43,6 +48,7 @@
 			<argument index="2" name="use_collision" type="bool" default="false">
 			</argument>
 			<description>
+				Returns the nearest point to the line segment passed. The third optional parameter takes collisions into account.
 			</description>
 		</method>
 		<method name="get_simple_path">
@@ -55,9 +61,10 @@
 			<argument index="2" name="optimize" type="bool" default="true">
 			</argument>
 			<description>
+				Returns a path of points as a [code]PoolVector3Array[/code]. If [code]optimize[/code] is false the [code]NavigationMesh[/code] agent properties will be taken into account, otherwise it will return the nearest path and ignore agent radius, height, etc.
 			</description>
 		</method>
-		<method name="navmesh_create">
+		<method name="navmesh_add">
 			<return type="int">
 			</return>
 			<argument index="0" name="mesh" type="NavigationMesh">
@@ -67,6 +74,7 @@
 			<argument index="2" name="owner" type="Object" default="null">
 			</argument>
 			<description>
+				Adds a [code]NavigationMesh[/code] to the list of NavigationMeshes in this node. Returns an id. Its position, rotation and scale are associated with the [code]Transform[/code] passed. The [code]Node[/code] (or [code]Object[/code]) that owns this node is an optional parameter.
 			</description>
 		</method>
 		<method name="navmesh_remove">
@@ -75,6 +83,7 @@
 			<argument index="0" name="id" type="int">
 			</argument>
 			<description>
+				Removes a [code]NavigationMesh[/code] from the list of NavigationMeshes in this node.
 			</description>
 		</method>
 		<method name="navmesh_set_transform">
@@ -85,11 +94,13 @@
 			<argument index="1" name="xform" type="Transform">
 			</argument>
 			<description>
+				Associates a [code]NavigationMesh[/code]'s id with a [code]Transform[/code]. Its position, rotation and scale are based on the [code]Transform[/code] passed.
 			</description>
 		</method>
 	</methods>
 	<members>
 		<member name="up_vector" type="Vector3" setter="set_up_vector" getter="get_up_vector">
+			Defines which direction is up. The default defines 0,1,0 as up which is the world up direction. To make this a ceiling use 0,-1,0 to define down as up.
 		</member>
 	</members>
 	<constants>
diff --git a/doc/classes/Navigation2D.xml b/doc/classes/Navigation2D.xml
index 8868348cf9..18c15a616a 100644
--- a/doc/classes/Navigation2D.xml
+++ b/doc/classes/Navigation2D.xml
@@ -37,7 +37,7 @@
 			<description>
 			</description>
 		</method>
-		<method name="navpoly_create">
+		<method name="navpoly_add">
 			<return type="int">
 			</return>
 			<argument index="0" name="mesh" type="NavigationPolygon">
diff --git a/doc/classes/Node.xml b/doc/classes/Node.xml
index 78591e2bf8..e56733f102 100644
--- a/doc/classes/Node.xml
+++ b/doc/classes/Node.xml
@@ -1,19 +1,19 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <class name="Node" inherits="Object" category="Core" version="3.0-beta">
 	<brief_description>
-		Base class for all the [i]scene[/i] elements.
+		Base class for all [i]scene[/i] objects.
 	</brief_description>
 	<description>
-		Nodes are the base bricks with which Godot games are developed. They can be set as children of other nodes, resulting in a tree arrangement. A given node can contain any number of nodes as children (but there is only one scene tree root node) with the requirement that all siblings (direct children of a node) should have unique names.
-		Any tree of nodes is called a [i]scene[/i]. Scenes can be saved to the disk and then instanced into other scenes. This allows for very high flexibility in the architecture and data model of the projects. Nodes can optionally be added to groups. This makes it easy to reach a number of nodes from the code (for example an "enemies" group) to perform grouped actions.
-		[b]Scene tree:[/b] The [SceneTree] contains the active tree of nodes. When a node is added to the scene tree, it receives the NOTIFICATION_ENTER_TREE notification and its [method _enter_tree] callback is triggered. Children nodes are always added [i]after[/i] their parent node, i.e. the [method _enter_tree] callback of a parent node will be triggered before its child's.
-		Once all nodes have been added in the scene tree, they receive the NOTIFICATION_READY notification and their respective [method _ready] callbacks are triggered. For groups of nodes, the [method _ready] callback is called in reverse order, from the children up to the parent nodes.
-		It means that when adding a scene to the scene tree, the following order will be used for the callbacks: [method _enter_tree] of the parent, [method _enter_tree] of the children, [method _ready] of the children and finally [method _ready] of the parent (and that recursively for the whole scene).
-		[b]Processing:[/b] Nodes can be set to the "process" state, so that they receive a callback on each frame requesting them to process (do something). Normal processing (callback [method _process], toggled with [method set_process]) happens as fast as possible and is dependent on the frame rate, so the processing time [i]delta[/i] is variable. Physics processing (callback [method _physics_process], toggled with [method set_physics_process]) happens a fixed amount of times per second (by default 60) and is useful to link itself to the physics.
-		Nodes can also process input events. When set, the [method _input] function will be called for each input that the program receives. In many cases, this can be overkill (unless used for simple projects), and the [method _unhandled_input] function might be preferred; it is called when the input event was not handled by anyone else (typically, GUI [Control] nodes), ensuring that the node only receives the events that were meant for it.
+		Nodes are Godot's building blocks. They can be assigned as the child of another node, resulting in a tree arrangement. A given node can contain any number of nodes as children with the requirement that all siblings (direct children of a node) should have unique names.
+		A tree of nodes is called a [i]scene[/i]. Scenes can be saved to the disk and then instanced into other scenes. This allows for very high flexibility in the architecture and data model of Godot projects. Nodes can also optionally be added to groups. This makes it possible to access a number of nodes from code (an "enemies" group, for example) to perform grouped actions.
+		[b]Scene tree:[/b] The [SceneTree] contains the active tree of nodes. When a node is added to the scene tree, it receives the NOTIFICATION_ENTER_TREE notification and its [method _enter_tree] callback is triggered. Child nodes are always added [i]after[/i] their parent node, i.e. the [method _enter_tree] callback of a parent node will be triggered before its child's.
+		Once all nodes have been added in the scene tree, they receive the NOTIFICATION_READY notification and their respective [method _ready] callbacks are triggered. For groups of nodes, the [method _ready] callback is called in reverse order, starting with the children and moving up to the parent nodes.
+		This means that when adding a node to the scene tree, the following order will be used for the callbacks: [method _enter_tree] of the parent, [method _enter_tree] of the children, [method _ready] of the children and finally [method _ready] of the parent (recursively for the entire scene tree).
+		[b]Processing:[/b] Nodes can override the "process" state, so that they receive a callback on each frame requesting them to process (do something). Normal processing (callback [method _process], toggled with [method set_process]) happens as fast as possible and is dependent on the frame rate, so the processing time [i]delta[/i] is passed as an argument. Physics processing (callback [method _physics_process], toggled with [method set_physics_process]) happens a fixed number of times per second (60 by default) and is useful for code related to the physics engine.
+		Nodes can also process input events. When present, the [method _input] function will be called for each input that the program receives. In many cases, this can be overkill (unless used for simple projects), and the [method _unhandled_input] function might be preferred; it is called when the input event was not handled by anyone else (typically, GUI [Control] nodes), ensuring that the node only receives the events that were meant for it.
 		To keep track of the scene hierarchy (especially when instancing scenes into other scenes), an "owner" can be set for the node with [method set_owner]. This keeps track of who instanced what. This is mostly useful when writing editors and tools, though.
 		Finally, when a node is freed with [method free] or [method queue_free], it will also free all its children.
-		[b]Networking with nodes:[/b] After connecting to a server (or making one, see [NetworkedMultiplayerENet]) it is possible to use the built-in RPC (remote procedure call) system to easily communicate over the network. By calling [method rpc] with a method name, it will be called locally, and in all connected peers (peers = clients and the server that accepts connections), with behaviour varying depending on the network mode ([method set_network_mode]) on the receiving peer. To identify which [code]Node[/code] receives the RPC call Godot will use its [NodePath] (make sure node names are the same on all peers).
+		[b]Networking with nodes:[/b] After connecting to a server (or making one, see [NetworkedMultiplayerENet]) it is possible to use the built-in RPC (remote procedure call) system to communicate over the network. By calling [method rpc] with a method name, it will be called locally and in all connected peers (peers = clients and the server that accepts connections), with behaviour varying depending on the network mode ([method set_network_mode]) of the receiving peer. To identify which node receives the RPC call Godot will use its [NodePath] (make sure node names are the same on all peers).
 	</description>
 	<tutorials>
 	</tutorials>
@@ -24,7 +24,7 @@
 			<return type="void">
 			</return>
 			<description>
-				Called when the node enters the [SceneTree] (e.g. upon instancing, scene changing or after calling [method add_child] in a script). If the node has children, its [method _enter_tree] callback will be called first, and then that of the children.
+				Called when the node enters the [SceneTree] (e.g. upon instancing, scene changing, or after calling [method add_child] in a script). If the node has children, its [method _enter_tree] callback will be called first, and then that of the children.
 				Corresponds to the NOTIFICATION_ENTER_TREE notification in [method Object._notification].
 			</description>
 		</method>
@@ -32,7 +32,7 @@
 			<return type="void">
 			</return>
 			<description>
-				Called when the node leaves the [SceneTree] (e.g. upon freeing, scene changing or after calling [method remove_child] in a script). If the node has children, its [method _exit_tree] callback will be called last, after all its children have left the tree.
+				Called when the node leaves the [SceneTree] (e.g. upon freeing, scene changing, or after calling [method remove_child] in a script). If the node has children, its [method _exit_tree] callback will be called last, after all its children have left the tree.
 				Corresponds to the NOTIFICATION_EXIT_TREE notification in [method Object._notification].
 			</description>
 		</method>
@@ -42,7 +42,7 @@
 			<argument index="0" name="event" type="InputEvent">
 			</argument>
 			<description>
-				Called when there is a change to input devices. Propagated through the node tree until a Node consumes it.
+				Called when there is an input event. The input event propagates through the node tree until a node consumes it.
 				It is only called if input processing is enabled, which is done automatically if this method is overriden, and can be toggled with [method set_process_input].
 			</description>
 		</method>
@@ -72,7 +72,7 @@
 			<return type="void">
 			</return>
 			<description>
-				Called when the node is "ready", i.e. when both the node and its children have entered the scene tree. If the node has children, their [method _ready] callback gets triggered first, and the node will receive the ready notification only afterwards.
+				Called when the node is "ready", i.e. when both the node and its children have entered the scene tree. If the node has children, their [method _ready] callbacks get triggered first, and the parent node will receive the ready notification afterwards.
 				Corresponds to the NOTIFICATION_READY notification in [method Object._notification].
 			</description>
 		</method>
@@ -102,8 +102,8 @@
 			<argument index="1" name="legible_unique_name" type="bool" default="false">
 			</argument>
 			<description>
-				Add a child [code]Node[/code]. Nodes can have as many children as they want, but every child must have a unique name. Children nodes are automatically deleted when the parent node is deleted, so deleting a whole scene is performed by deleting its topmost node.
-				The optional boolean argument enforces creating child nodes with human-readable names, based on the name of the node being instanced instead of its type only.
+				Adds a child node. Nodes can have any number of children, but every child must have a unique name. Child nodes are automatically deleted when the parent node is deleted, so an entire scene can be removed by deleting its topmost node.
+				Setting "legible_unique_name" to [code]true[/code] creates child nodes with human-readable names, based on the name of the node being instanced instead of its type.
 			</description>
 		</method>
 		<method name="add_child_below_node">
@@ -116,6 +116,8 @@
 			<argument index="2" name="legible_unique_name" type="bool" default="false">
 			</argument>
 			<description>
+				Adds a child node. The child is placed below the given node in the list of children.
+				Setting "legible_unique_name" to [code]true[/code] creates child nodes with human-readable names, based on the name of the node being instanced instead of its type.
 			</description>
 		</method>
 		<method name="add_to_group">
@@ -126,14 +128,14 @@
 			<argument index="1" name="persistent" type="bool" default="false">
 			</argument>
 			<description>
-				Add a node to a group. Groups are helpers to name and organize a subset of nodes, like for example "enemies" or "collectables". A [code]Node[/code] can be in any number of groups. Nodes can be assigned a group at any time, but will not be added to it until they are inside the scene tree (see [method is_inside_tree]).
+				Adds the node to a group. Groups are helpers to name and organize a subset of nodes, for example "enemies" or "collectables". A node can be in any number of groups. Nodes can be assigned a group at any time, but will not be added until they are inside the scene tree (see [method is_inside_tree]).
 			</description>
 		</method>
 		<method name="can_process" qualifiers="const">
 			<return type="bool">
 			</return>
 			<description>
-				Return true if the node can process, i.e. whether its pause mode allows processing while the scene tree is paused (see [method set_pause_mode]). Always returns true if the scene tree is not paused, and false if the node is not in the tree. FIXME: Why FAIL_COND?
+				Returns [code]true[/code] if the node can process while the scene tree is paused (see [method set_pause_mode]). Always returns [code]true[/code] if the scene tree is not paused, and [code]false[/code] if the node is not in the tree. FIXME: Why FAIL_COND?
 			</description>
 		</method>
 		<method name="duplicate" qualifiers="const">
@@ -142,8 +144,8 @@
 			<argument index="0" name="flags" type="int" default="15">
 			</argument>
 			<description>
-				Duplicate the node, returning a new [code]Node[/code].
-				You can fine-tune the behavior using the [code]flags[/code], which are based on the DUPLICATE_* constants.
+				Duplicates the node, returning a new node.
+				You can fine-tune the behavior using the [code]flags[/code]. See DUPLICATE_* constants.
 			</description>
 		</method>
 		<method name="find_node" qualifiers="const">
@@ -172,35 +174,35 @@
 			<return type="int">
 			</return>
 			<description>
-				Returns the amount of child nodes.
+				Returns the number of child nodes.
 			</description>
 		</method>
 		<method name="get_children" qualifiers="const">
 			<return type="Array">
 			</return>
 			<description>
-				Returns an array of references ([code]Node[/code]) to the child nodes.
+				Returns an array of references to the node's children.
 			</description>
 		</method>
 		<method name="get_filename" qualifiers="const">
 			<return type="String">
 			</return>
 			<description>
-				Returns a filename that may be contained by the node. When a scene is instanced from a file, it topmost node contains the filename from where it was loaded (see [method set_filename]).
+				Returns a filename that may be contained by the node. When a scene is instanced from a file, its topmost node contains the filename from which it was loaded (see [method set_filename]).
 			</description>
 		</method>
 		<method name="get_groups" qualifiers="const">
 			<return type="Array">
 			</return>
 			<description>
-				Returns an array listing the groups that the node is part of.
+				Returns an array listing the groups that the node is a member of.
 			</description>
 		</method>
 		<method name="get_index" qualifiers="const">
 			<return type="int">
 			</return>
 			<description>
-				Returns the node index, i.e. its position among the siblings of its parent.
+				Returns the node's index, i.e. its position among the siblings of its parent.
 			</description>
 		</method>
 		<method name="get_name" qualifiers="const">
@@ -223,7 +225,7 @@
 			<argument index="0" name="path" type="NodePath">
 			</argument>
 			<description>
-				Fetches a node. The [NodePath] must be valid (or else an error will be raised) and can be either the path to child node, a relative path (from the current node to another node), or an absolute path to a node.
+				Fetches a node. The [NodePath] can be either a relative path (from the current node) or an absolute path (in the scene tree) to a node. If the path does not exist, a [code]null instance[/code] is returned and attempts to access it will result in an "Attempt to call &lt;method&gt; on a null instance." error.
 				Note: fetching absolute paths only works when the node is inside the scene tree (see [method is_inside_tree]).
 				[i]Example:[/i] Assume your current node is Character and the following tree:
 				[codeblock]
@@ -280,7 +282,7 @@
 			<argument index="0" name="node" type="Node">
 			</argument>
 			<description>
-				Returns the relative path from the current node to the specified node in "node" argument. Both nodes must be in the same scene, or else the function will fail.
+				Returns the relative path from the current node to the specified node in "node" argument. Both nodes must be in the same scene, or the function will fail.
 			</description>
 		</method>
 		<method name="get_physics_process_delta_time" qualifiers="const">
@@ -294,14 +296,14 @@
 			<return type="int">
 			</return>
 			<description>
-				Returns the order in the node tree branch, i.e. if called by the first child Node, return 0.
+				Returns the node's order in the scene tree branch. For example, if called on the first child node the position is [code]0[/code].
 			</description>
 		</method>
 		<method name="get_process_delta_time" qualifiers="const">
 			<return type="float">
 			</return>
 			<description>
-				Returns the time elapsed (in seconds) since the last process callback. This is almost always different each time.
+				Returns the time elapsed (in seconds) since the last process callback. This value may vary from frame to frame.
 			</description>
 		</method>
 		<method name="get_scene_instance_load_placeholder" qualifiers="const">
@@ -314,14 +316,14 @@
 			<return type="SceneTree">
 			</return>
 			<description>
-				Returns the [SceneTree] that this node is inside.
+				Returns the [SceneTree] that contains this node.
 			</description>
 		</method>
 		<method name="get_viewport" qualifiers="const">
 			<return type="Viewport">
 			</return>
 			<description>
-				Returns the [Viewport] for this node.
+				Returns the node's [Viewport].
 			</description>
 		</method>
 		<method name="has_node" qualifiers="const">
@@ -347,13 +349,14 @@
 			<argument index="0" name="node" type="Node">
 			</argument>
 			<description>
-				Returns [code]true[/code] if the "node" argument is a direct or indirect child of the current node, otherwise return [code]false[code].
+				Returns [code]true[/code] if the given node is a direct or indirect child of the current node.
 			</description>
 		</method>
 		<method name="is_displayed_folded" qualifiers="const">
 			<return type="bool">
 			</return>
 			<description>
+				Returns [code]true[/code] if the node is folded (collapsed) in the Scene dock.
 			</description>
 		</method>
 		<method name="is_greater_than" qualifiers="const">
@@ -362,7 +365,7 @@
 			<argument index="0" name="node" type="Node">
 			</argument>
 			<description>
-				Returns [code]true[/code] if [code]node[/code] occurs later in the scene hierarchy than the current node, otherwise return [code]false[/code].
+				Returns [code]true[/code] if the given node occurs later in the scene hierarchy than the current node.
 			</description>
 		</method>
 		<method name="is_in_group" qualifiers="const">
@@ -371,14 +374,14 @@
 			<argument index="0" name="group" type="String">
 			</argument>
 			<description>
-				Returns [code]true[/code] if this Node is in the specified group.
+				Returns [code]true[/code] if this node is in the specified group.
 			</description>
 		</method>
 		<method name="is_inside_tree" qualifiers="const">
 			<return type="bool">
 			</return>
 			<description>
-				Returns [code]true[/code] if this Node is currently inside a [SceneTree].
+				Returns [code]true[/code] if this node is currently inside a [SceneTree].
 			</description>
 		</method>
 		<method name="is_network_master" qualifiers="const">
@@ -468,7 +471,7 @@
 			<argument index="2" name="parent_first" type="bool" default="false">
 			</argument>
 			<description>
-				Calls the method (if present) with the arguments given in "args" on this Node and recursively on all children. If the parent_first argument is true then the method will be called on the current [code]Node[/code] first, then on all children. If it is false then the children will get called first.
+				Calls the given method (if present) with the arguments given in [code]args[/code] on this node and recursively on all its children. If the [code]parent_first[/code] argument is [code]true[/code], the method will be called on the current node first, then on all children. If it is [code]false[/code], the children will be called first.
 			</description>
 		</method>
 		<method name="propagate_notification">
@@ -477,28 +480,28 @@
 			<argument index="0" name="what" type="int">
 			</argument>
 			<description>
-				Notify the current node and all its children recursively by calling notification() in all of them.
+				Notifies the current node and all its children recursively by calling notification() on all of them.
 			</description>
 		</method>
 		<method name="queue_free">
 			<return type="void">
 			</return>
 			<description>
-				Queues a node for deletion at the end of the current frame. When deleted, all of its children nodes will be deleted as well. This method ensures it's safe to delete the node, contrary to [method Object.free]. Use [method Object.is_queued_for_deletion] to know whether a node will be deleted at the end of the frame.
+				Queues a node for deletion at the end of the current frame. When deleted, all of its child nodes will be deleted as well. This method ensures it's safe to delete the node, contrary to [method Object.free]. Use [method Object.is_queued_for_deletion] to check whether a node will be deleted at the end of the frame.
 			</description>
 		</method>
 		<method name="raise">
 			<return type="void">
 			</return>
 			<description>
-				Moves this node to the top of the array of nodes of the parent node. This is often useful on GUIs ([Control]), because their order of drawing fully depends on their order in the tree.
+				Moves this node to the top of the array of nodes of the parent node. This is often useful in GUIs ([Control] nodes), because their order of drawing depends on their order in the tree.
 			</description>
 		</method>
 		<method name="remove_and_skip">
 			<return type="void">
 			</return>
 			<description>
-				Removes a node and set all its children as children of the parent node (if exists). All event subscriptions that pass by the removed node will be unsubscribed.
+				Removes a node and sets all its children as children of the parent node (if it exists). All event subscriptions that pass by the removed node will be unsubscribed.
 			</description>
 		</method>
 		<method name="remove_child">
@@ -507,7 +510,7 @@
 			<argument index="0" name="node" type="Node">
 			</argument>
 			<description>
-				Removes a child [code]Node[/code]. Node is NOT deleted and will have to be deleted manually.
+				Removes a child node. The node is NOT deleted and must be deleted manually.
 			</description>
 		</method>
 		<method name="remove_from_group">
@@ -527,7 +530,7 @@
 			<argument index="1" name="keep_data" type="bool" default="false">
 			</argument>
 			<description>
-				Replaces a node in a scene by a given one. Subscriptions that pass through this node will be lost.
+				Replaces a node in a scene by the given one. Subscriptions that pass through this node will be lost.
 			</description>
 		</method>
 		<method name="request_ready">
@@ -653,6 +656,7 @@
 			<argument index="0" name="fold" type="bool">
 			</argument>
 			<description>
+				Sets the folded state of the node in the Scene dock.
 			</description>
 		</method>
 		<method name="set_filename">
@@ -661,7 +665,7 @@
 			<argument index="0" name="filename" type="String">
 			</argument>
 			<description>
-				A node can contain a filename. This filename should not be changed by the user, unless writing editors and tools. When a scene is instanced from a file, it topmost node contains the filename from where it was loaded.
+				A node can contain a filename. This filename should not be changed by the user, unless writing editors and tools. When a scene is instanced from a file, its topmost node contains the filename from which it was loaded.
 			</description>
 		</method>
 		<method name="set_name">
@@ -670,7 +674,7 @@
 			<argument index="0" name="name" type="String">
 			</argument>
 			<description>
-				Sets the name of the [code]Node[/code]. Name must be unique within parent, and setting an already existing name will cause for the node to be automatically renamed.
+				Sets the name of the node. The name must be unique within the parent. Using an existing name will cause the node to be automatically renamed.
 			</description>
 		</method>
 		<method name="set_network_master">
@@ -690,7 +694,7 @@
 			<argument index="0" name="owner" type="Node">
 			</argument>
 			<description>
-				Sets the node owner. A node can have any other node as owner (as long as a valid parent, grandparent, etc ascending in the tree). When saving a node (using SceneSaver) all the nodes it owns will be saved with it. This allows to create complex SceneTrees, with instancing and subinstancing.
+				Sets the node owner. A node can have any other node as owner (as long as it is a valid parent, grandparent, etc. ascending in the tree). When saving a node (using SceneSaver), all the nodes it owns will be saved with it. This allows for the creation of complex [SceneTree]s, with instancing and subinstancing.
 			</description>
 		</method>
 		<method name="set_physics_process">
@@ -699,7 +703,7 @@
 			<argument index="0" name="enable" type="bool">
 			</argument>
 			<description>
-				Enables or disables the node's physics (alias fixed framerate) processing. When a node is being processed, it will receive a NOTIFICATION_PHYSICS_PROCESS at a fixed (usually 60 fps, check [OS] to change that) interval (and the [method _physics_process] callback will be called if exists). Enabled automatically if [method _physics_process] is overriden. Any calls to this before [method _ready] will be ignored.
+				Enables or disables physics (i.e. fixed framerate) processing. When a node is being processed, it will receive a NOTIFICATION_PHYSICS_PROCESS at a fixed (usually 60 fps, see [OS] to change) interval (and the [method _physics_process] callback will be called if it exists). Enabled automatically if [method _physics_process] is overridden. Any calls to this before [method _ready] will be ignored.
 			</description>
 		</method>
 		<method name="set_physics_process_internal">
@@ -716,7 +720,7 @@
 			<argument index="0" name="enable" type="bool">
 			</argument>
 			<description>
-				Enables or disables node processing. When a node is being processed, it will receive a NOTIFICATION_PROCESS on every drawn frame (and the [method _process] callback will be called if exists). Enabled automatically if [method _process] is overriden. Any calls to this before [method _ready] will be ignored.
+				Enables or disables processing. When a node is being processed, it will receive a NOTIFICATION_PROCESS on every drawn frame (and the [method _process] callback will be called if it exists). Enabled automatically if [method _process] is overridden. Any calls to this before [method _ready] will be ignored.
 			</description>
 		</method>
 		<method name="set_process_input">
@@ -725,7 +729,7 @@
 			<argument index="0" name="enable" type="bool">
 			</argument>
 			<description>
-				Enables input processing for node. This is not required for GUI controls! It hooks up the node to receive all input (see [method _input]). Enabled automatically if [method _input] is overriden. Any calls to this before [method _ready] will be ignored.
+				Enables or disables input processing. This is not required for GUI controls! Enabled automatically if [method _input] is overridden. Any calls to this before [method _ready] will be ignored.
 			</description>
 		</method>
 		<method name="set_process_internal">
@@ -742,7 +746,7 @@
 			<argument index="0" name="enable" type="bool">
 			</argument>
 			<description>
-				Enables unhandled input processing for node. This is not required for GUI controls! It hooks up the node to receive all input that was not previously handled before (usually by a [Control]). Enabled automatically if [method _unhandled_input] is overriden. Any calls to this before [method _ready] will be ignored.
+				Enables unhandled input processing. This is not required for GUI controls! It enables the node to receive all input that was not previously handled (usually by a [Control]). Enabled automatically if [method _unhandled_input] is overridden. Any calls to this before [method _ready] will be ignored.
 			</description>
 		</method>
 		<method name="set_process_unhandled_key_input">
@@ -751,7 +755,7 @@
 			<argument index="0" name="enable" type="bool">
 			</argument>
 			<description>
-				Enables unhandled key input processing for node. Enabled automatically if [method _unhandled_key_input] is overriden. Any calls to this before [method _ready] will be ignored.
+				Enables unhandled key input processing. Enabled automatically if [method _unhandled_key_input] is overridden. Any calls to this before [method _ready] will be ignored.
 			</description>
 		</method>
 		<method name="set_scene_instance_load_placeholder">
@@ -775,12 +779,12 @@
 		</signal>
 		<signal name="tree_entered">
 			<description>
-				Emitted when Node enters the tree.
+				Emitted when the node enters the tree.
 			</description>
 		</signal>
 		<signal name="tree_exited">
 			<description>
-				Emitted when Node exits the tree.
+				Emitted when the node exits the tree.
 			</description>
 		</signal>
 	</signals>
diff --git a/doc/classes/PopupMenu.xml b/doc/classes/PopupMenu.xml
index 42ed57e4af..13cf16d2ee 100644
--- a/doc/classes/PopupMenu.xml
+++ b/doc/classes/PopupMenu.xml
@@ -379,24 +379,24 @@
 				Sets the metadata of an item, which might be of any type. You can later get it with [method get_item_metadata], which provides a simple way of assigning context data to items.
 			</description>
 		</method>
-		<method name="set_item_shortcut">
+		<method name="set_item_multistate">
 			<return type="void">
 			</return>
 			<argument index="0" name="idx" type="int">
 			</argument>
-			<argument index="1" name="shortcut" type="ShortCut">
-			</argument>
-			<argument index="2" name="global" type="bool" default="false">
+			<argument index="1" name="state" type="int">
 			</argument>
 			<description>
 			</description>
 		</method>
-		<method name="set_item_statable">
+		<method name="set_item_shortcut">
 			<return type="void">
 			</return>
 			<argument index="0" name="idx" type="int">
 			</argument>
-			<argument index="1" name="state" type="int">
+			<argument index="1" name="shortcut" type="ShortCut">
+			</argument>
+			<argument index="2" name="global" type="bool" default="false">
 			</argument>
 			<description>
 			</description>
@@ -441,7 +441,7 @@
 			<description>
 			</description>
 		</method>
-		<method name="toggle_item_statable">
+		<method name="toggle_item_multistate">
 			<return type="void">
 			</return>
 			<argument index="0" name="idx" type="int">
diff --git a/doc/classes/String.xml b/doc/classes/String.xml
index 78e9f3cd3f..8bbd52b417 100644
--- a/doc/classes/String.xml
+++ b/doc/classes/String.xml
@@ -662,8 +662,11 @@
 			</argument>
 			<argument index="1" name="allow_empty" type="bool" default="True">
 			</argument>
+			<argument index="2" name="maxsplit" type="int" default="0">
+			</argument>
 			<description>
 				Splits the string by a divisor string and returns an array of the substrings. Example "One,Two,Three" will return ["One","Two","Three"] if split by ",".
+				If [code]maxsplit[/code] is given, at most [code]maxsplit[/code] splits occur, and the remainder of the string is returned as the final element of the array (thus, the array will have at most [code]maxsplit + 1[/code] elements).
 			</description>
 		</method>
 		<method name="split_floats">
diff --git a/drivers/gles3/rasterizer_scene_gles3.cpp b/drivers/gles3/rasterizer_scene_gles3.cpp
index d38ec2a1f9..29ab531177 100644
--- a/drivers/gles3/rasterizer_scene_gles3.cpp
+++ b/drivers/gles3/rasterizer_scene_gles3.cpp
@@ -1557,7 +1557,7 @@ void RasterizerSceneGLES3::_render_geometry(RenderList::Element *e) {
 
 					glEnableVertexAttribArray(VS::ARRAY_NORMAL);
 					glBufferSubData(GL_ARRAY_BUFFER, buf_ofs, sizeof(Vector3) * vertices, c.normals.ptr());
-					glVertexAttribPointer(VS::ARRAY_NORMAL, 3, GL_FLOAT, false, sizeof(Vector3) * vertices, ((uint8_t *)NULL) + buf_ofs);
+					glVertexAttribPointer(VS::ARRAY_NORMAL, 3, GL_FLOAT, false, sizeof(Vector3), ((uint8_t *)NULL) + buf_ofs);
 					buf_ofs += sizeof(Vector3) * vertices;
 
 				} else {
@@ -1569,7 +1569,7 @@ void RasterizerSceneGLES3::_render_geometry(RenderList::Element *e) {
 
 					glEnableVertexAttribArray(VS::ARRAY_TANGENT);
 					glBufferSubData(GL_ARRAY_BUFFER, buf_ofs, sizeof(Plane) * vertices, c.tangents.ptr());
-					glVertexAttribPointer(VS::ARRAY_TANGENT, 4, GL_FLOAT, false, sizeof(Plane) * vertices, ((uint8_t *)NULL) + buf_ofs);
+					glVertexAttribPointer(VS::ARRAY_TANGENT, 4, GL_FLOAT, false, sizeof(Plane), ((uint8_t *)NULL) + buf_ofs);
 					buf_ofs += sizeof(Plane) * vertices;
 
 				} else {
@@ -1849,6 +1849,20 @@ void RasterizerSceneGLES3::_setup_light(RenderList::Element *e, const Transform
 
 			state.scene_shader.set_uniform(SceneShaderGLES3::GI_PROBE2_ENABLED, false);
 		}
+	} else if (!e->instance->lightmap_capture_data.empty()) {
+
+		glUniform4fv(state.scene_shader.get_uniform_location(SceneShaderGLES3::LIGHTMAP_CAPTURES), 12, (const GLfloat *)e->instance->lightmap_capture_data.ptr());
+		state.scene_shader.set_uniform(SceneShaderGLES3::LIGHTMAP_CAPTURE_SKY, false);
+
+	} else if (e->instance->lightmap.is_valid()) {
+		RasterizerStorageGLES3::Texture *lightmap = storage->texture_owner.getornull(e->instance->lightmap);
+		RasterizerStorageGLES3::LightmapCapture *capture = storage->lightmap_capture_data_owner.getornull(e->instance->lightmap_capture->base);
+
+		if (lightmap && capture) {
+			glActiveTexture(GL_TEXTURE0 + storage->config.max_texture_image_units - 9);
+			glBindTexture(GL_TEXTURE_2D, lightmap->tex_id);
+			state.scene_shader.set_uniform(SceneShaderGLES3::LIGHTMAP_ENERGY, capture->energy);
+		}
 	}
 }
 
@@ -1971,6 +1985,8 @@ void RasterizerSceneGLES3::_render_list(RenderList::Element **p_elements, int p_
 					state.scene_shader.set_conditional(SceneShaderGLES3::SHADOW_MODE_PCF_5, false);
 					state.scene_shader.set_conditional(SceneShaderGLES3::SHADOW_MODE_PCF_13, false);
 					state.scene_shader.set_conditional(SceneShaderGLES3::USE_GI_PROBES, false);
+					state.scene_shader.set_conditional(SceneShaderGLES3::USE_LIGHTMAP_CAPTURE, false);
+					state.scene_shader.set_conditional(SceneShaderGLES3::USE_LIGHTMAP, false);
 					state.scene_shader.set_conditional(SceneShaderGLES3::USE_RADIANCE_MAP, false);
 					state.scene_shader.set_conditional(SceneShaderGLES3::USE_CONTACT_SHADOWS, false);
 
@@ -1978,6 +1994,8 @@ void RasterizerSceneGLES3::_render_list(RenderList::Element **p_elements, int p_
 				} else {
 
 					state.scene_shader.set_conditional(SceneShaderGLES3::USE_GI_PROBES, e->instance->gi_probe_instances.size() > 0);
+					state.scene_shader.set_conditional(SceneShaderGLES3::USE_LIGHTMAP, e->instance->lightmap.is_valid() && e->instance->gi_probe_instances.size() == 0);
+					state.scene_shader.set_conditional(SceneShaderGLES3::USE_LIGHTMAP_CAPTURE, !e->instance->lightmap_capture_data.empty() && !e->instance->lightmap.is_valid() && e->instance->gi_probe_instances.size() == 0);
 
 					state.scene_shader.set_conditional(SceneShaderGLES3::SHADELESS, false);
 
@@ -2148,6 +2166,8 @@ void RasterizerSceneGLES3::_render_list(RenderList::Element **p_elements, int p_
 	state.scene_shader.set_conditional(SceneShaderGLES3::SHADOW_MODE_PCF_5, false);
 	state.scene_shader.set_conditional(SceneShaderGLES3::SHADOW_MODE_PCF_13, false);
 	state.scene_shader.set_conditional(SceneShaderGLES3::USE_GI_PROBES, false);
+	state.scene_shader.set_conditional(SceneShaderGLES3::USE_LIGHTMAP, false);
+	state.scene_shader.set_conditional(SceneShaderGLES3::USE_LIGHTMAP_CAPTURE, false);
 	state.scene_shader.set_conditional(SceneShaderGLES3::USE_CONTACT_SHADOWS, false);
 	state.scene_shader.set_conditional(SceneShaderGLES3::USE_VERTEX_LIGHTING, false);
 	state.scene_shader.set_conditional(SceneShaderGLES3::USE_OPAQUE_PREPASS, false);
@@ -2274,6 +2294,14 @@ void RasterizerSceneGLES3::_add_geometry_with_material(RasterizerStorageGLES3::G
 			e->sort_key |= SORT_KEY_GI_PROBES_FLAG;
 		}
 
+		if (e->instance->lightmap.is_valid()) {
+			e->sort_key |= SORT_KEY_LIGHTMAP_FLAG;
+		}
+
+		if (!e->instance->lightmap_capture_data.empty()) {
+			e->sort_key |= SORT_KEY_LIGHTMAP_CAPTURE_FLAG;
+		}
+
 		e->sort_key |= uint64_t(p_material->render_priority + 128) << RenderList::SORT_KEY_PRIORITY_SHIFT;
 	} else {
 		e->sort_key |= uint64_t(e->instance->depth_layer) << RenderList::SORT_KEY_OPAQUE_DEPTH_LAYER_SHIFT;
diff --git a/drivers/gles3/rasterizer_scene_gles3.h b/drivers/gles3/rasterizer_scene_gles3.h
index ffbe10fb60..6df223c961 100644
--- a/drivers/gles3/rasterizer_scene_gles3.h
+++ b/drivers/gles3/rasterizer_scene_gles3.h
@@ -662,19 +662,21 @@ public:
 			SORT_KEY_OPAQUE_DEPTH_LAYER_SHIFT = 52,
 			SORT_KEY_OPAQUE_DEPTH_LAYER_MASK = 0xF,
 //64 bits unsupported in MSVC
-#define SORT_KEY_UNSHADED_FLAG (uint64_t(1) << 51)
-#define SORT_KEY_NO_DIRECTIONAL_FLAG (uint64_t(1) << 50)
-#define SORT_KEY_GI_PROBES_FLAG (uint64_t(1) << 49)
-#define SORT_KEY_VERTEX_LIT_FLAG (uint64_t(1) << 48)
-			SORT_KEY_SHADING_SHIFT = 48,
-			SORT_KEY_SHADING_MASK = 15,
-			//48-32 material index
-			SORT_KEY_MATERIAL_INDEX_SHIFT = 32,
-			//32-12 geometry index
-			SORT_KEY_GEOMETRY_INDEX_SHIFT = 12,
-			//bits 12-8 geometry type
-			SORT_KEY_GEOMETRY_TYPE_SHIFT = 8,
-			//bits 0-7 for flags
+#define SORT_KEY_UNSHADED_FLAG (uint64_t(1) << 49)
+#define SORT_KEY_NO_DIRECTIONAL_FLAG (uint64_t(1) << 48)
+#define SORT_KEY_LIGHTMAP_CAPTURE_FLAG (uint64_t(1) << 47)
+#define SORT_KEY_LIGHTMAP_FLAG (uint64_t(1) << 46)
+#define SORT_KEY_GI_PROBES_FLAG (uint64_t(1) << 45)
+#define SORT_KEY_VERTEX_LIT_FLAG (uint64_t(1) << 44)
+			SORT_KEY_SHADING_SHIFT = 44,
+			SORT_KEY_SHADING_MASK = 63,
+			//44-28 material index
+			SORT_KEY_MATERIAL_INDEX_SHIFT = 28,
+			//28-8 geometry index
+			SORT_KEY_GEOMETRY_INDEX_SHIFT = 8,
+			//bits 5-7 geometry type
+			SORT_KEY_GEOMETRY_TYPE_SHIFT = 5,
+			//bits 0-4 for flags
 			SORT_KEY_OPAQUE_PRE_PASS = 8,
 			SORT_KEY_CULL_DISABLED_FLAG = 4,
 			SORT_KEY_SKELETON_FLAG = 2,
diff --git a/drivers/gles3/rasterizer_storage_gles3.cpp b/drivers/gles3/rasterizer_storage_gles3.cpp
index cba9f08537..ee6c738a05 100644
--- a/drivers/gles3/rasterizer_storage_gles3.cpp
+++ b/drivers/gles3/rasterizer_storage_gles3.cpp
@@ -5216,6 +5216,104 @@ void RasterizerStorageGLES3::gi_probe_dynamic_data_update(RID p_gi_probe_data, i
 	//glTexImage3D(GL_TEXTURE_3D,p_mipmap,GL_RGBA8,gipd->width>>p_mipmap,gipd->height>>p_mipmap,gipd->depth>>p_mipmap,0,GL_RGBA,GL_UNSIGNED_BYTE,p_data);
 	//glTexImage3D(GL_TEXTURE_3D,p_mipmap,GL_RGBA8,gipd->width>>p_mipmap,gipd->height>>p_mipmap,gipd->depth>>p_mipmap,0,GL_RGBA,GL_UNSIGNED_BYTE,data.ptr());
 }
+/////////////////////////////
+
+RID RasterizerStorageGLES3::lightmap_capture_create() {
+
+	LightmapCapture *capture = memnew(LightmapCapture);
+	return lightmap_capture_data_owner.make_rid(capture);
+}
+
+void RasterizerStorageGLES3::lightmap_capture_set_bounds(RID p_capture, const AABB &p_bounds) {
+
+	LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND(!capture);
+	capture->bounds = p_bounds;
+	capture->instance_change_notify();
+}
+AABB RasterizerStorageGLES3::lightmap_capture_get_bounds(RID p_capture) const {
+
+	const LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND_V(!capture, AABB());
+	return capture->bounds;
+}
+void RasterizerStorageGLES3::lightmap_capture_set_octree(RID p_capture, const PoolVector<uint8_t> &p_octree) {
+
+	LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND(!capture);
+
+	ERR_FAIL_COND(p_octree.size() == 0 || (p_octree.size() % sizeof(LightmapCaptureOctree)) != 0);
+
+	capture->octree.resize(p_octree.size() / sizeof(LightmapCaptureOctree));
+	if (p_octree.size()) {
+		PoolVector<LightmapCaptureOctree>::Write w = capture->octree.write();
+		PoolVector<uint8_t>::Read r = p_octree.read();
+		copymem(w.ptr(), r.ptr(), p_octree.size());
+	}
+	capture->instance_change_notify();
+}
+PoolVector<uint8_t> RasterizerStorageGLES3::lightmap_capture_get_octree(RID p_capture) const {
+
+	const LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND_V(!capture, PoolVector<uint8_t>());
+
+	if (capture->octree.size() == 0)
+		return PoolVector<uint8_t>();
+
+	PoolVector<uint8_t> ret;
+	ret.resize(capture->octree.size() * sizeof(LightmapCaptureOctree));
+	{
+		PoolVector<LightmapCaptureOctree>::Read r = capture->octree.read();
+		PoolVector<uint8_t>::Write w = ret.write();
+		copymem(w.ptr(), r.ptr(), ret.size());
+	}
+
+	return ret;
+}
+
+void RasterizerStorageGLES3::lightmap_capture_set_octree_cell_transform(RID p_capture, const Transform &p_xform) {
+	LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND(!capture);
+	capture->cell_xform = p_xform;
+}
+
+Transform RasterizerStorageGLES3::lightmap_capture_get_octree_cell_transform(RID p_capture) const {
+	const LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND_V(!capture, Transform());
+	return capture->cell_xform;
+}
+
+void RasterizerStorageGLES3::lightmap_capture_set_octree_cell_subdiv(RID p_capture, int p_subdiv) {
+	LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND(!capture);
+	capture->cell_subdiv = p_subdiv;
+}
+
+int RasterizerStorageGLES3::lightmap_capture_get_octree_cell_subdiv(RID p_capture) const {
+	const LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND_V(!capture, 0);
+	return capture->cell_subdiv;
+}
+
+void RasterizerStorageGLES3::lightmap_capture_set_energy(RID p_capture, float p_energy) {
+
+	LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND(!capture);
+	capture->energy = p_energy;
+}
+
+float RasterizerStorageGLES3::lightmap_capture_get_energy(RID p_capture) const {
+
+	const LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND_V(!capture, 0);
+	return capture->energy;
+}
+
+const PoolVector<RasterizerStorage::LightmapCaptureOctree> *RasterizerStorageGLES3::lightmap_capture_get_octree_ptr(RID p_capture) const {
+	const LightmapCapture *capture = lightmap_capture_data_owner.getornull(p_capture);
+	ERR_FAIL_COND_V(!capture, NULL);
+	return &capture->octree;
+}
 
 ///////
 
@@ -5817,6 +5915,10 @@ void RasterizerStorageGLES3::instance_add_dependency(RID p_base, RasterizerScene
 			inst = gi_probe_owner.getornull(p_base);
 			ERR_FAIL_COND(!inst);
 		} break;
+		case VS::INSTANCE_LIGHTMAP_CAPTURE: {
+			inst = lightmap_capture_data_owner.getornull(p_base);
+			ERR_FAIL_COND(!inst);
+		} break;
 		default: {
 			if (!inst) {
 				ERR_FAIL();
@@ -5860,6 +5962,10 @@ void RasterizerStorageGLES3::instance_remove_dependency(RID p_base, RasterizerSc
 			inst = gi_probe_owner.getornull(p_base);
 			ERR_FAIL_COND(!inst);
 		} break;
+		case VS::INSTANCE_LIGHTMAP_CAPTURE: {
+			inst = lightmap_capture_data_owner.getornull(p_base);
+			ERR_FAIL_COND(!inst);
+		} break;
 		default: {
 
 			if (!inst) {
@@ -6609,6 +6715,10 @@ VS::InstanceType RasterizerStorageGLES3::get_base_type(RID p_rid) const {
 		return VS::INSTANCE_GI_PROBE;
 	}
 
+	if (lightmap_capture_data_owner.owns(p_rid)) {
+		return VS::INSTANCE_LIGHTMAP_CAPTURE;
+	}
+
 	return VS::INSTANCE_NONE;
 }
 
@@ -6795,6 +6905,13 @@ bool RasterizerStorageGLES3::free(RID p_rid) {
 		glDeleteTextures(1, &gi_probe_data->tex_id);
 		gi_probe_owner.free(p_rid);
 		memdelete(gi_probe_data);
+	} else if (lightmap_capture_data_owner.owns(p_rid)) {
+
+		// no GL texture to delete here. NOTE(review): free() below targets gi_probe_owner; expected lightmap_capture_data_owner
+		LightmapCapture *lightmap_capture = lightmap_capture_data_owner.get(p_rid);
+
+		gi_probe_owner.free(p_rid);
+		memdelete(lightmap_capture);
 
 	} else if (canvas_occluder_owner.owns(p_rid)) {
 
diff --git a/drivers/gles3/rasterizer_storage_gles3.h b/drivers/gles3/rasterizer_storage_gles3.h
index 25327af0a5..6647372688 100644
--- a/drivers/gles3/rasterizer_storage_gles3.h
+++ b/drivers/gles3/rasterizer_storage_gles3.h
@@ -1069,6 +1069,38 @@ public:
 	virtual RID gi_probe_dynamic_data_create(int p_width, int p_height, int p_depth, GIProbeCompression p_compression);
 	virtual void gi_probe_dynamic_data_update(RID p_gi_probe_data, int p_depth_slice, int p_slice_count, int p_mipmap, const void *p_data);
 
+	/* LIGHTMAP CAPTURE */
+
+	virtual RID lightmap_capture_create();
+	virtual void lightmap_capture_set_bounds(RID p_capture, const AABB &p_bounds);
+	virtual AABB lightmap_capture_get_bounds(RID p_capture) const;
+	virtual void lightmap_capture_set_octree(RID p_capture, const PoolVector<uint8_t> &p_octree);
+	virtual PoolVector<uint8_t> lightmap_capture_get_octree(RID p_capture) const;
+	virtual void lightmap_capture_set_octree_cell_transform(RID p_capture, const Transform &p_xform);
+	virtual Transform lightmap_capture_get_octree_cell_transform(RID p_capture) const;
+	virtual void lightmap_capture_set_octree_cell_subdiv(RID p_capture, int p_subdiv);
+	virtual int lightmap_capture_get_octree_cell_subdiv(RID p_capture) const;
+
+	virtual void lightmap_capture_set_energy(RID p_capture, float p_energy);
+	virtual float lightmap_capture_get_energy(RID p_capture) const;
+
+	virtual const PoolVector<LightmapCaptureOctree> *lightmap_capture_get_octree_ptr(RID p_capture) const;
+
+	struct LightmapCapture : public Instantiable {
+
+		PoolVector<LightmapCaptureOctree> octree;
+		AABB bounds;
+		Transform cell_xform;
+		int cell_subdiv;
+		float energy;
+		LightmapCapture() {
+			energy = 1.0;
+			cell_subdiv = 1;
+		}
+	};
+
+	mutable RID_Owner<LightmapCapture> lightmap_capture_data_owner;
+
 	/* PARTICLES */
 
 	struct Particles : public GeometryOwner {
diff --git a/drivers/gles3/shader_compiler_gles3.cpp b/drivers/gles3/shader_compiler_gles3.cpp
index 21102e8c25..101978548d 100644
--- a/drivers/gles3/shader_compiler_gles3.cpp
+++ b/drivers/gles3/shader_compiler_gles3.cpp
@@ -812,6 +812,7 @@ ShaderCompilerGLES3::ShaderCompilerGLES3() {
 	//for light
 	actions[VS::SHADER_SPATIAL].renames["VIEW"] = "view";
 	actions[VS::SHADER_SPATIAL].renames["LIGHT_COLOR"] = "light_color";
+	actions[VS::SHADER_SPATIAL].renames["LIGHT"] = "light";
 	actions[VS::SHADER_SPATIAL].renames["ATTENUATION"] = "attenuation";
 	actions[VS::SHADER_SPATIAL].renames["DIFFUSE_LIGHT"] = "diffuse_light";
 	actions[VS::SHADER_SPATIAL].renames["SPECULAR_LIGHT"] = "specular_light";
diff --git a/drivers/gles3/shaders/scene.glsl b/drivers/gles3/shaders/scene.glsl
index 9b817c7a4e..9bc2bc079d 100644
--- a/drivers/gles3/shaders/scene.glsl
+++ b/drivers/gles3/shaders/scene.glsl
@@ -35,14 +35,14 @@ layout(location=3) in vec4 color_attrib;
 layout(location=4) in vec2 uv_attrib;
 #endif
 
-#if defined(ENABLE_UV2_INTERP)
+#if defined(ENABLE_UV2_INTERP) || defined(USE_LIGHTMAP)
 layout(location=5) in vec2 uv2_attrib;
 #endif
 
 uniform float normal_mult;
 
 #ifdef USE_SKELETON
-layout(location=6) in ivec4 bone_indices; // attrib:6
+layout(location=6) in uvec4 bone_indices; // attrib:6
 layout(location=7) in vec4 bone_weights; // attrib:7
 #endif
 
@@ -223,7 +223,7 @@ out vec4 color_interp;
 out vec2 uv_interp;
 #endif
 
-#if defined(ENABLE_UV2_INTERP)
+#if defined(ENABLE_UV2_INTERP) || defined (USE_LIGHTMAP)
 out vec2 uv2_interp;
 #endif
 
@@ -234,9 +234,6 @@ out vec3 binormal_interp;
 #endif
 
 
-
-
-
 #if defined(USE_MATERIAL)
 
 layout(std140) uniform UniformData { //ubo:1
@@ -302,14 +299,16 @@ void main() {
 #ifdef USE_SKELETON
 	{
 		//skeleton transform
-		ivec2 tex_ofs = ivec2( bone_indices.x%256, (bone_indices.x/256)*3 );
+		ivec4 bone_indicesi = ivec4(bone_indices); // cast to signed int
+
+		ivec2 tex_ofs = ivec2( bone_indicesi.x%256, (bone_indicesi.x/256)*3 );
 		highp mat3x4 m = mat3x4(
 			texelFetch(skeleton_texture,tex_ofs,0),
 			texelFetch(skeleton_texture,tex_ofs+ivec2(0,1),0),
 			texelFetch(skeleton_texture,tex_ofs+ivec2(0,2),0)
 		) * bone_weights.x;
 
-		tex_ofs = ivec2( bone_indices.y%256, (bone_indices.y/256)*3 );
+		tex_ofs = ivec2( bone_indicesi.y%256, (bone_indicesi.y/256)*3 );
 
 		m+= mat3x4(
 					texelFetch(skeleton_texture,tex_ofs,0),
@@ -317,7 +316,7 @@ void main() {
 					texelFetch(skeleton_texture,tex_ofs+ivec2(0,2),0)
 				) * bone_weights.y;
 
-		tex_ofs = ivec2( bone_indices.z%256, (bone_indices.z/256)*3 );
+		tex_ofs = ivec2( bone_indicesi.z%256, (bone_indicesi.z/256)*3 );
 
 		m+= mat3x4(
 					texelFetch(skeleton_texture,tex_ofs,0),
@@ -326,7 +325,7 @@ void main() {
 				) * bone_weights.z;
 
 
-		tex_ofs = ivec2( bone_indices.w%256, (bone_indices.w/256)*3 );
+		tex_ofs = ivec2( bone_indicesi.w%256, (bone_indicesi.w/256)*3 );
 
 		m+= mat3x4(
 					texelFetch(skeleton_texture,tex_ofs,0),
@@ -354,7 +353,7 @@ void main() {
 	uv_interp = uv_attrib;
 #endif
 
-#if defined(ENABLE_UV2_INTERP)
+#if defined(ENABLE_UV2_INTERP) || defined(USE_LIGHTMAP)
 	uv2_interp = uv2_attrib;
 #endif
 
@@ -547,7 +546,7 @@ in vec4 color_interp;
 in vec2 uv_interp;
 #endif
 
-#if defined(ENABLE_UV2_INTERP)
+#if defined(ENABLE_UV2_INTERP) || defined(USE_LIGHTMAP)
 in vec2 uv2_interp;
 #endif
 
@@ -1355,7 +1354,7 @@ void reflection_process(int idx, vec3 vertex, vec3 normal,vec3 binormal, vec3 ta
 
 		reflection_accum+=reflection;
 	}
-
+#ifndef USE_LIGHTMAP
 	if (reflections[idx].ambient.a>0.0) { //compute ambient using skybox
 
 
@@ -1401,8 +1400,20 @@ void reflection_process(int idx, vec3 vertex, vec3 normal,vec3 binormal, vec3 ta
 		ambient_accum+=ambient_out;
 
 	}
+#endif
 }
 
+#ifdef USE_LIGHTMAP
+uniform mediump sampler2D lightmap; //texunit:-9
+uniform mediump float lightmap_energy;
+#endif
+
+#ifdef USE_LIGHTMAP_CAPTURE
+uniform mediump vec4[12] lightmap_captures;
+uniform bool lightmap_capture_sky;
+
+#endif
+
 #ifdef USE_GI_PROBES
 
 uniform mediump sampler3D gi_probe1; //texunit:-9
@@ -1630,7 +1641,7 @@ void main() {
 	vec2 uv = uv_interp;
 #endif
 
-#if defined(ENABLE_UV2_INTERP)
+#if defined(ENABLE_UV2_INTERP) || defined (USE_LIGHTMAP)
 	vec2 uv2 = uv2_interp;
 #endif
 
@@ -1743,7 +1754,7 @@ FRAGMENT_SHADER_CODE
 			//vec3 radiance = textureLod(radiance_cube, r, lod).xyz * ( brdf.x + brdf.y);
 
 		}
-
+#ifndef USE_LIGHTMAP
 		{
 
 			vec3 ambient_dir=normalize((radiance_inverse_xform * vec4(normal,0.0)).xyz);
@@ -1752,6 +1763,7 @@ FRAGMENT_SHADER_CODE
 			ambient_light=mix(ambient_light_color.rgb,env_ambient,radiance_ambient_contribution);
 			//ambient_light=vec3(0.0,0.0,0.0);
 		}
+#endif
 	}
 
 #else
@@ -1936,6 +1948,48 @@ FRAGMENT_SHADER_CODE
 
 #endif
 
+#ifdef USE_LIGHTMAP
+	ambient_light = texture(lightmap,uv2).rgb * lightmap_energy;
+#endif
+
+#ifdef USE_LIGHTMAP_CAPTURE
+	{
+		vec3 cone_dirs[12] = vec3[] (
+			vec3(0, 0, 1),
+			vec3(0.866025, 0, 0.5),
+			vec3(0.267617, 0.823639, 0.5),
+			vec3(-0.700629, 0.509037, 0.5),
+			vec3(-0.700629, -0.509037, 0.5),
+			vec3(0.267617, -0.823639, 0.5),
+			vec3(0, 0, -1),
+			vec3(0.866025, 0, -0.5),
+			vec3(0.267617, 0.823639, -0.5),
+			vec3(-0.700629, 0.509037, -0.5),
+			vec3(-0.700629, -0.509037, -0.5),
+			vec3(0.267617, -0.823639, -0.5)
+		);
+
+
+		vec3 local_normal = normalize(camera_matrix * vec4(normal,0.0)).xyz;
+		vec4 captured = vec4(0.0);
+		float sum = 0.0;
+		for(int i=0;i<12;i++) {
+			float amount = max(0.0,dot(local_normal,cone_dirs[i])); //not correct, but creates a nice wrap around effect
+			captured += lightmap_captures[i]*amount;
+			sum+=amount;
+		}
+
+		captured/=sum;
+
+		if (lightmap_capture_sky) {
+			ambient_light = mix( ambient_light, captured.rgb, captured.a);
+		} else {
+			ambient_light = captured.rgb;
+		}
+
+	}
+#endif
+
 #ifdef USE_FORWARD_LIGHTING
 
 
@@ -1950,11 +2004,11 @@ FRAGMENT_SHADER_CODE
 	} else {
 		specular_light+=env_reflection_light;
 	}
-
+#ifndef USE_LIGHTMAP
 	if (ambient_accum.a>0.0) {
 		ambient_light+=ambient_accum.rgb/ambient_accum.a;
 	}
-
+#endif
 
 
 #ifdef USE_VERTEX_LIGHTING
diff --git a/drivers/unix/packet_peer_udp_posix.cpp b/drivers/unix/packet_peer_udp_posix.cpp
index 61d2737555..f6742d8114 100644
--- a/drivers/unix/packet_peer_udp_posix.cpp
+++ b/drivers/unix/packet_peer_udp_posix.cpp
@@ -65,7 +65,7 @@ int PacketPeerUDPPosix::get_available_packet_count() const {
 	return queue_count;
 }
 
-Error PacketPeerUDPPosix::get_packet(const uint8_t **r_buffer, int &r_buffer_size) const {
+Error PacketPeerUDPPosix::get_packet(const uint8_t **r_buffer, int &r_buffer_size) {
 
 	Error err = const_cast<PacketPeerUDPPosix *>(this)->_poll(false);
 	if (err != OK)
diff --git a/drivers/unix/packet_peer_udp_posix.h b/drivers/unix/packet_peer_udp_posix.h
index e580d336b2..ad7be5bbe0 100644
--- a/drivers/unix/packet_peer_udp_posix.h
+++ b/drivers/unix/packet_peer_udp_posix.h
@@ -41,12 +41,12 @@ class PacketPeerUDPPosix : public PacketPeerUDP {
 		PACKET_BUFFER_SIZE = 65536
 	};
 
-	mutable RingBuffer<uint8_t> rb;
+	RingBuffer<uint8_t> rb;
 	uint8_t recv_buffer[PACKET_BUFFER_SIZE];
-	mutable uint8_t packet_buffer[PACKET_BUFFER_SIZE];
-	mutable IP_Address packet_ip;
-	mutable int packet_port;
-	mutable int queue_count;
+	uint8_t packet_buffer[PACKET_BUFFER_SIZE];
+	IP_Address packet_ip;
+	int packet_port;
+	int queue_count;
 	int sockfd;
 	bool sock_blocking;
 	IP::Type sock_type;
@@ -62,7 +62,7 @@ class PacketPeerUDPPosix : public PacketPeerUDP {
 
 public:
 	virtual int get_available_packet_count() const;
-	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size) const;
+	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size);
 	virtual Error put_packet(const uint8_t *p_buffer, int p_buffer_size);
 
 	virtual int get_max_packet_size() const;
diff --git a/platform/windows/packet_peer_udp_winsock.cpp b/drivers/windows/packet_peer_udp_winsock.cpp
index d414ec891e..119ee68bd2 100644
--- a/platform/windows/packet_peer_udp_winsock.cpp
+++ b/drivers/windows/packet_peer_udp_winsock.cpp
@@ -27,6 +27,8 @@
 /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
+#ifdef WINDOWS_ENABLED
+
 #include "packet_peer_udp_winsock.h"
 
 #include <winsock2.h>
@@ -43,7 +45,7 @@ int PacketPeerUDPWinsock::get_available_packet_count() const {
 	return queue_count;
 }
 
-Error PacketPeerUDPWinsock::get_packet(const uint8_t **r_buffer, int &r_buffer_size) const {
+Error PacketPeerUDPWinsock::get_packet(const uint8_t **r_buffer, int &r_buffer_size) {
 
 	Error err = const_cast<PacketPeerUDPWinsock *>(this)->_poll(false);
 	if (err != OK)
@@ -291,3 +293,5 @@ PacketPeerUDPWinsock::~PacketPeerUDPWinsock() {
 
 	close();
 }
+
+#endif
diff --git a/platform/windows/packet_peer_udp_winsock.h b/drivers/windows/packet_peer_udp_winsock.h
index 8a6951fd6e..8ce2cff741 100644
--- a/platform/windows/packet_peer_udp_winsock.h
+++ b/drivers/windows/packet_peer_udp_winsock.h
@@ -27,6 +27,8 @@
 /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
+#ifdef WINDOWS_ENABLED
+
 #ifndef PACKET_PEER_UDP_WINSOCK_H
 #define PACKET_PEER_UDP_WINSOCK_H
 
@@ -39,12 +41,12 @@ class PacketPeerUDPWinsock : public PacketPeerUDP {
 		PACKET_BUFFER_SIZE = 65536
 	};
 
-	mutable RingBuffer<uint8_t> rb;
+	RingBuffer<uint8_t> rb;
 	uint8_t recv_buffer[PACKET_BUFFER_SIZE];
-	mutable uint8_t packet_buffer[PACKET_BUFFER_SIZE];
-	mutable IP_Address packet_ip;
-	mutable int packet_port;
-	mutable int queue_count;
+	uint8_t packet_buffer[PACKET_BUFFER_SIZE];
+	IP_Address packet_ip;
+	int packet_port;
+	int queue_count;
 	int sockfd;
 	bool sock_blocking;
 	IP::Type sock_type;
@@ -62,7 +64,7 @@ class PacketPeerUDPWinsock : public PacketPeerUDP {
 
 public:
 	virtual int get_available_packet_count() const;
-	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size) const;
+	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size);
 	virtual Error put_packet(const uint8_t *p_buffer, int p_buffer_size);
 
 	virtual int get_max_packet_size() const;
@@ -82,3 +84,5 @@ public:
 	~PacketPeerUDPWinsock();
 };
 #endif // PACKET_PEER_UDP_WINSOCK_H
+
+#endif
diff --git a/platform/windows/stream_peer_winsock.cpp b/drivers/windows/stream_peer_tcp_winsock.cpp
index 8b83215325..f4cd38079d 100644
--- a/platform/windows/stream_peer_winsock.cpp
+++ b/drivers/windows/stream_peer_tcp_winsock.cpp
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  stream_peer_winsock.cpp                                              */
+/*  stream_peer_tcp_winsock.cpp                                          */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -29,7 +29,7 @@
 /*************************************************************************/
 #ifdef WINDOWS_ENABLED
 
-#include "stream_peer_winsock.h"
+#include "stream_peer_tcp_winsock.h"
 
 #include <winsock2.h>
 #include <ws2tcpip.h>
@@ -38,14 +38,14 @@
 
 int winsock_refcount = 0;
 
-StreamPeerTCP *StreamPeerWinsock::_create() {
+StreamPeerTCP *StreamPeerTCPWinsock::_create() {
 
-	return memnew(StreamPeerWinsock);
+	return memnew(StreamPeerTCPWinsock);
 };
 
-void StreamPeerWinsock::make_default() {
+void StreamPeerTCPWinsock::make_default() {
 
-	StreamPeerTCP::_create = StreamPeerWinsock::_create;
+	StreamPeerTCP::_create = StreamPeerTCPWinsock::_create;
 
 	if (winsock_refcount == 0) {
 		WSADATA data;
@@ -54,7 +54,7 @@ void StreamPeerWinsock::make_default() {
 	++winsock_refcount;
 };
 
-void StreamPeerWinsock::cleanup() {
+void StreamPeerTCPWinsock::cleanup() {
 
 	--winsock_refcount;
 	if (winsock_refcount == 0) {
@@ -63,7 +63,7 @@ void StreamPeerWinsock::cleanup() {
 	};
 };
 
-Error StreamPeerWinsock::_block(int p_sockfd, bool p_read, bool p_write) const {
+Error StreamPeerTCPWinsock::_block(int p_sockfd, bool p_read, bool p_write) const {
 
 	fd_set read, write;
 	FD_ZERO(&read);
@@ -78,7 +78,7 @@ Error StreamPeerWinsock::_block(int p_sockfd, bool p_read, bool p_write) const {
 	return ret < 0 ? FAILED : OK;
 };
 
-Error StreamPeerWinsock::_poll_connection() const {
+Error StreamPeerTCPWinsock::_poll_connection() const {
 
 	ERR_FAIL_COND_V(status != STATUS_CONNECTING || sockfd == INVALID_SOCKET, FAILED);
 
@@ -108,7 +108,7 @@ Error StreamPeerWinsock::_poll_connection() const {
 	return OK;
 };
 
-Error StreamPeerWinsock::write(const uint8_t *p_data, int p_bytes, int &r_sent, bool p_block) {
+Error StreamPeerTCPWinsock::write(const uint8_t *p_data, int p_bytes, int &r_sent, bool p_block) {
 
 	if (status == STATUS_NONE || status == STATUS_ERROR) {
 
@@ -166,7 +166,7 @@ Error StreamPeerWinsock::write(const uint8_t *p_data, int p_bytes, int &r_sent,
 	return OK;
 };
 
-Error StreamPeerWinsock::read(uint8_t *p_buffer, int p_bytes, int &r_received, bool p_block) {
+Error StreamPeerTCPWinsock::read(uint8_t *p_buffer, int p_bytes, int &r_received, bool p_block) {
 
 	if (!is_connected_to_host()) {
 
@@ -224,29 +224,29 @@ Error StreamPeerWinsock::read(uint8_t *p_buffer, int p_bytes, int &r_received, b
 	return OK;
 };
 
-Error StreamPeerWinsock::put_data(const uint8_t *p_data, int p_bytes) {
+Error StreamPeerTCPWinsock::put_data(const uint8_t *p_data, int p_bytes) {
 
 	int total;
 	return write(p_data, p_bytes, total, true);
 };
 
-Error StreamPeerWinsock::put_partial_data(const uint8_t *p_data, int p_bytes, int &r_sent) {
+Error StreamPeerTCPWinsock::put_partial_data(const uint8_t *p_data, int p_bytes, int &r_sent) {
 
 	return write(p_data, p_bytes, r_sent, false);
 };
 
-Error StreamPeerWinsock::get_data(uint8_t *p_buffer, int p_bytes) {
+Error StreamPeerTCPWinsock::get_data(uint8_t *p_buffer, int p_bytes) {
 
 	int total;
 	return read(p_buffer, p_bytes, total, true);
 };
 
-Error StreamPeerWinsock::get_partial_data(uint8_t *p_buffer, int p_bytes, int &r_received) {
+Error StreamPeerTCPWinsock::get_partial_data(uint8_t *p_buffer, int p_bytes, int &r_received) {
 
 	return read(p_buffer, p_bytes, r_received, false);
 };
 
-StreamPeerTCP::Status StreamPeerWinsock::get_status() const {
+StreamPeerTCP::Status StreamPeerTCPWinsock::get_status() const {
 
 	if (status == STATUS_CONNECTING) {
 		_poll_connection();
@@ -255,7 +255,7 @@ StreamPeerTCP::Status StreamPeerWinsock::get_status() const {
 	return status;
 };
 
-bool StreamPeerWinsock::is_connected_to_host() const {
+bool StreamPeerTCPWinsock::is_connected_to_host() const {
 
 	if (status == STATUS_NONE || status == STATUS_ERROR) {
 
@@ -268,7 +268,7 @@ bool StreamPeerWinsock::is_connected_to_host() const {
 	return (sockfd != INVALID_SOCKET);
 };
 
-void StreamPeerWinsock::disconnect_from_host() {
+void StreamPeerTCPWinsock::disconnect_from_host() {
 
 	if (sockfd != INVALID_SOCKET)
 		closesocket(sockfd);
@@ -281,7 +281,7 @@ void StreamPeerWinsock::disconnect_from_host() {
 	peer_port = 0;
 };
 
-void StreamPeerWinsock::set_socket(int p_sockfd, IP_Address p_host, int p_port, IP::Type p_sock_type) {
+void StreamPeerTCPWinsock::set_socket(int p_sockfd, IP_Address p_host, int p_port, IP::Type p_sock_type) {
 
 	sockfd = p_sockfd;
 	sock_type = p_sock_type;
@@ -290,7 +290,7 @@ void StreamPeerWinsock::set_socket(int p_sockfd, IP_Address p_host, int p_port,
 	peer_port = p_port;
 };
 
-Error StreamPeerWinsock::connect_to_host(const IP_Address &p_host, uint16_t p_port) {
+Error StreamPeerTCPWinsock::connect_to_host(const IP_Address &p_host, uint16_t p_port) {
 
 	ERR_FAIL_COND_V(!p_host.is_valid(), ERR_INVALID_PARAMETER);
 
@@ -331,13 +331,13 @@ Error StreamPeerWinsock::connect_to_host(const IP_Address &p_host, uint16_t p_po
 	return OK;
 };
 
-void StreamPeerWinsock::set_nodelay(bool p_enabled) {
+void StreamPeerTCPWinsock::set_nodelay(bool p_enabled) {
 	ERR_FAIL_COND(!is_connected_to_host());
 	int flag = p_enabled ? 1 : 0;
 	setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
 }
 
-int StreamPeerWinsock::get_available_bytes() const {
+int StreamPeerTCPWinsock::get_available_bytes() const {
 
 	unsigned long len;
 	int ret = ioctlsocket(sockfd, FIONREAD, &len);
@@ -345,17 +345,17 @@ int StreamPeerWinsock::get_available_bytes() const {
 	return len;
 }
 
-IP_Address StreamPeerWinsock::get_connected_host() const {
+IP_Address StreamPeerTCPWinsock::get_connected_host() const {
 
 	return peer_host;
 };
 
-uint16_t StreamPeerWinsock::get_connected_port() const {
+uint16_t StreamPeerTCPWinsock::get_connected_port() const {
 
 	return peer_port;
 };
 
-StreamPeerWinsock::StreamPeerWinsock() {
+StreamPeerTCPWinsock::StreamPeerTCPWinsock() {
 
 	sock_type = IP::TYPE_NONE;
 	sockfd = INVALID_SOCKET;
@@ -363,7 +363,7 @@ StreamPeerWinsock::StreamPeerWinsock() {
 	peer_port = 0;
 };
 
-StreamPeerWinsock::~StreamPeerWinsock() {
+StreamPeerTCPWinsock::~StreamPeerTCPWinsock() {
 
 	disconnect_from_host();
 };
diff --git a/platform/windows/stream_peer_winsock.h b/drivers/windows/stream_peer_tcp_winsock.h
index 26e2a3e4c9..fef457c43f 100644
--- a/platform/windows/stream_peer_winsock.h
+++ b/drivers/windows/stream_peer_tcp_winsock.h
@@ -29,15 +29,15 @@
 /*************************************************************************/
 #ifdef WINDOWS_ENABLED
 
-#ifndef STREAM_PEER_WINSOCK_H
-#define STREAM_PEER_WINSOCK_H
+#ifndef STREAM_PEER_TCP_WINSOCK_H
+#define STREAM_PEER_TCP_WINSOCK_H
 
 #include "error_list.h"
 
 #include "core/io/ip_address.h"
 #include "core/io/stream_peer_tcp.h"
 
-class StreamPeerWinsock : public StreamPeerTCP {
+class StreamPeerTCPWinsock : public StreamPeerTCP {
 
 protected:
 	mutable Status status;
@@ -82,10 +82,10 @@ public:
 
 	virtual void set_nodelay(bool p_enabled);
 
-	StreamPeerWinsock();
-	~StreamPeerWinsock();
+	StreamPeerTCPWinsock();
+	~StreamPeerTCPWinsock();
 };
 
-#endif // TCP_SOCKET_POSIX_H
+#endif // STREAM_PEER_TCP_WINSOCK_H
 
 #endif
diff --git a/platform/windows/tcp_server_winsock.cpp b/drivers/windows/tcp_server_winsock.cpp
index de300befa7..49de279793 100644
--- a/platform/windows/tcp_server_winsock.cpp
+++ b/drivers/windows/tcp_server_winsock.cpp
@@ -27,9 +27,11 @@
 /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
+#ifdef WINDOWS_ENABLED
+
 #include "tcp_server_winsock.h"
 
-#include "stream_peer_winsock.h"
+#include "stream_peer_tcp_winsock.h"
 
 #include <winsock2.h>
 #include <ws2tcpip.h>
@@ -151,7 +153,7 @@ Ref<StreamPeerTCP> TCPServerWinsock::take_connection() {
 	int fd = accept(listen_sockfd, (struct sockaddr *)&their_addr, &sin_size);
 	ERR_FAIL_COND_V(fd == INVALID_SOCKET, NULL);
 
-	Ref<StreamPeerWinsock> conn = memnew(StreamPeerWinsock);
+	Ref<StreamPeerTCPWinsock> conn = memnew(StreamPeerTCPWinsock);
 	IP_Address ip;
 	int port;
 	_set_ip_addr_port(ip, port, &their_addr);
@@ -181,3 +183,5 @@ TCPServerWinsock::~TCPServerWinsock() {
 
 	stop();
 };
+
+#endif
diff --git a/platform/windows/tcp_server_winsock.h b/drivers/windows/tcp_server_winsock.h
index a3e01098ed..fd16480167 100644
--- a/platform/windows/tcp_server_winsock.h
+++ b/drivers/windows/tcp_server_winsock.h
@@ -27,6 +27,8 @@
 /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
+#ifdef WINDOWS_ENABLED
+
 #ifndef TCP_SERVER_WINSOCK_H
 #define TCP_SERVER_WINSOCK_H
 
@@ -54,3 +56,5 @@ public:
 };
 
 #endif
+
+#endif
diff --git a/editor/animation_editor.cpp b/editor/animation_editor.cpp
index c6757ba98f..c6381864b3 100644
--- a/editor/animation_editor.cpp
+++ b/editor/animation_editor.cpp
@@ -324,7 +324,7 @@ public:
 			int existing = animation->track_find_key(track, new_time, true);
 
 			setting = true;
-			undo_redo->create_action(TTR("Move Add Key"), UndoRedo::MERGE_ENDS);
+			undo_redo->create_action(TTR("Anim Change Keyframe Time"), UndoRedo::MERGE_ENDS);
 
 			Variant val = animation->track_get_key_value(track, key);
 			float trans = animation->track_get_key_transition(track, key);
@@ -391,7 +391,7 @@ public:
 					}
 
 					setting = true;
-					undo_redo->create_action(TTR("Anim Change Value"), UndoRedo::MERGE_ENDS);
+					undo_redo->create_action(TTR("Anim Change Keyframe Value"), UndoRedo::MERGE_ENDS);
 					Variant prev = animation->track_get_key_value(track, key);
 					undo_redo->add_do_method(animation.ptr(), "track_set_key_value", track, key, value);
 					undo_redo->add_undo_method(animation.ptr(), "track_set_key_value", track, key, prev);
diff --git a/editor/connections_dialog.cpp b/editor/connections_dialog.cpp
index cd60455f4f..c095229374 100644
--- a/editor/connections_dialog.cpp
+++ b/editor/connections_dialog.cpp
@@ -88,12 +88,6 @@ public:
 
 void ConnectDialog::_notification(int p_what) {
 
-	if (p_what == NOTIFICATION_DRAW) {
-
-		//RID ci = get_canvas_item();
-		//get_stylebox("panel","PopupMenu")->draw(ci,Rect2(Point2(),get_size()));
-	}
-
 	if (p_what == NOTIFICATION_ENTER_TREE) {
 		bind_editor->edit(cdbinds);
 	}
@@ -117,11 +111,6 @@ void ConnectDialog::_tree_node_selected() {
 	dst_path->set_text(node->get_path_to(current));
 }
 
-void ConnectDialog::_dst_method_list_selected(int p_idx) {
-
-	//dst_method->set_text( dst_method_list->get_popup()->get_item_text(p_idx));
-}
-
 void ConnectDialog::edit(Node *p_node) {
 
 	node = p_node;
@@ -247,9 +236,7 @@ void ConnectDialog::set_dst_method(const StringName &p_method) {
 
 void ConnectDialog::_bind_methods() {
 
-	//ClassDB::bind_method("_ok",&ConnectDialog::_ok_pressed);
 	ClassDB::bind_method("_cancel", &ConnectDialog::_cancel_pressed);
-	//ClassDB::bind_method("_dst_method_list_selected",&ConnectDialog::_dst_method_list_selected);
 	ClassDB::bind_method("_tree_node_selected", &ConnectDialog::_tree_node_selected);
 
 	ClassDB::bind_method("_add_bind", &ConnectDialog::_add_bind);
@@ -355,18 +342,6 @@ ConnectDialog::ConnectDialog() {
 	oneshot->set_text(TTR("Oneshot"));
 	dstm_hb->add_child(oneshot);
 
-	/*
-	realtime = memnew( CheckButton );
-	realtime->set_anchor( MARGIN_TOP, ANCHOR_END );
-	realtime->set_anchor( MARGIN_BOTTOM, ANCHOR_END );
-	realtime->set_anchor( MARGIN_RIGHT, ANCHOR_END );
-	realtime->set_begin( Point2( 120, button_margin-10 ) );
-	realtime->set_end( Point2( 80, margin ) );
-	realtime->set_text("Realtime");
-	add_child(realtime);
-*/
-
-	//dst_method_list->get_popup()->connect("id_pressed", this,"_dst_method_list_selected");
 	tree->connect("node_selected", this, "_tree_node_selected");
 
 	set_as_toplevel(true);
@@ -377,7 +352,6 @@ ConnectDialog::ConnectDialog() {
 	add_child(error);
 	error->get_ok()->set_text(TTR("Close"));
 	get_ok()->set_text(TTR("Connect"));
-	//error->get_cancel()->set_text("Close");
 }
 
 ConnectDialog::~ConnectDialog() {
@@ -386,12 +360,6 @@ ConnectDialog::~ConnectDialog() {
 
 void ConnectionsDock::_notification(int p_what) {
 
-	if (p_what == NOTIFICATION_DRAW) {
-
-		//RID ci = get_canvas_item();
-		//get_stylebox("panel","PopupMenu")->draw(ci,Rect2(Point2(),get_size()));
-	}
-
 	if (p_what == EditorSettings::NOTIFICATION_EDITOR_SETTINGS_CHANGED) {
 		update_tree();
 	}
@@ -478,7 +446,7 @@ void ConnectionsDock::_connect_pressed() {
 		Connection c = item->get_metadata(0);
 		ERR_FAIL_COND(c.source != node); //shouldn't happen but...bugcheck
 
-		undo_redo->create_action(TTR("Create Subscription"));
+		undo_redo->create_action(vformat(TTR("Disconnect '%s' from '%s'"), c.signal, c.method));
 		undo_redo->add_do_method(node, "disconnect", c.signal, c.target, c.method);
 		undo_redo->add_undo_method(node, "connect", c.signal, c.target, c.method, Vector<Variant>(), c.flags);
 		undo_redo->add_do_method(this, "update_tree");
@@ -491,42 +459,6 @@ void ConnectionsDock::_connect_pressed() {
 		update_tree();
 	}
 }
-/*
-void ConnectionsDock::_remove() {
-
-	if (!tree->get_selected())
-		return;
-
-	TreeItem *selected=tree->get_selected();
-	if (!selected)
-		return;
-
-	Dictionary meta=selected->get_metadata(0);
-
-	remove_confirm->set_text(String()+"Remove Connection \""+meta["from_event"].operator String()+"\" ?");
-	remove_confirm->popup_centered(Size2(340,80));
-}
-*/
-/*
-void ConnectionsDock::_remove_confirm() {
-
-	if (!tree->get_selected())
-		return;
-	TreeItem *selected=tree->get_selected();
-	if (!selected)
-		return;
-
-	Dictionary meta=selected->get_metadata(0);
-
-	undo_redo->create_action("Remove Subscription");
-	undo_redo->add_do_method(node,"unsubscribe_path_event",meta["from_event"].operator String(),meta["from_path"].operator NodePath(),meta["to_method"].operator String());
-	undo_redo->add_undo_method(node,"subscribe_path_event_persist",meta["from_event"].operator String(),meta["from_path"].operator NodePath(),meta["to_method"].operator String(),Array(),false);
-	undo_redo->add_do_method(this,"update_tree");
-	undo_redo->add_undo_method(this,"update_tree");
-	undo_redo->commit_action();
-
-}
-*/
 
 struct _ConnectionsDockMethodInfoSort {
 
diff --git a/editor/connections_dialog.h b/editor/connections_dialog.h
index 53f4d857bf..99a83ff599 100644
--- a/editor/connections_dialog.h
+++ b/editor/connections_dialog.h
@@ -55,7 +55,6 @@ class ConnectDialog : public ConfirmationDialog {
 	LineEdit *dst_path;
 	LineEdit *dst_method;
 	SceneTreeEditor *tree;
-	//MenuButton *dst_method_list;
 	OptionButton *type_list;
 	CheckButton *deferred;
 	CheckButton *oneshot;
@@ -66,7 +65,6 @@ class ConnectDialog : public ConfirmationDialog {
 	void ok_pressed();
 	void _cancel_pressed();
 	void _tree_node_selected();
-	void _dst_method_list_selected(int p_idx);
 	void _add_bind();
 	void _remove_bind();
 
@@ -84,8 +82,6 @@ public:
 	void set_dst_method(const StringName &p_method);
 	void set_dst_node(Node *p_node);
 
-	//Button *get_ok() { return ok; }
-	//Button *get_cancel() { return cancel; }
 	void edit(Node *p_node);
 
 	ConnectDialog();
diff --git a/editor/create_dialog.cpp b/editor/create_dialog.cpp
index c058d290bf..2584d26fc4 100644
--- a/editor/create_dialog.cpp
+++ b/editor/create_dialog.cpp
@@ -370,7 +370,7 @@ void CreateDialog::_notification(int p_what) {
 void CreateDialog::set_base_type(const String &p_base) {
 
 	base_type = p_base;
-	set_title(TTR("Create New") + " " + p_base);
+	set_title(vformat(TTR("Create New %s"), p_base));
 	_update_search();
 }
 
diff --git a/editor/dependency_editor.cpp b/editor/dependency_editor.cpp
index ec0ca3add5..f357f1e51f 100644
--- a/editor/dependency_editor.cpp
+++ b/editor/dependency_editor.cpp
@@ -678,7 +678,7 @@ bool OrphanResourcesDialog::_fill_owners(EditorFileSystemDirectory *efsd, HashMa
 				int ds = efsd->get_file_deps(i).size();
 				ti->set_text(1, itos(ds));
 				if (ds) {
-					ti->add_button(1, get_icon("Visible", "EditorIcons"));
+					ti->add_button(1, get_icon("GuiVisibilityVisible", "EditorIcons"));
 				}
 				ti->set_metadata(0, path);
 				has_childs = true;
diff --git a/editor/editor_export.cpp b/editor/editor_export.cpp
index a746a0b140..3585417d13 100644
--- a/editor/editor_export.cpp
+++ b/editor/editor_export.cpp
@@ -39,10 +39,10 @@
 #include "io/zip_io.h"
 #include "os/file_access.h"
 #include "project_settings.h"
+#include "scene/resources/scene_format_text.h"
 #include "script_language.h"
-#include "version.h"
-
 #include "thirdparty/misc/md5.h"
+#include "version.h"
 
 static int _get_pad(int p_alignment, int p_n) {
 
@@ -1409,3 +1409,30 @@ EditorExportPlatformPC::EditorExportPlatformPC() {
 
 	chmod_flags = -1;
 }
+
+///////////////////////
+
+void EditorExportTextSceneToBinaryPlugin::_export_file(const String &p_path, const String &p_type, const Set<String> &p_features) {
+	// Export hook: if the project setting below is on, converts the text resource at
+	// p_path (.tres/.tscn) to binary and adds it as "<p_path>.converted.res".
+	String extension = p_path.get_extension().to_lower();
+	if (extension != "tres" && extension != "tscn") {
+		return;
+	}
+	// Read the setting before logging so nothing is printed when no conversion is done.
+	bool convert = GLOBAL_GET("editor/convert_text_resources_to_binary_on_export");
+	if (!convert)
+		return;
+	print_line("exporting " + p_path);
+	String tmp_path = EditorSettings::get_singleton()->get_cache_dir().plus_file("file.res");
+	Error err = ResourceFormatLoaderText::convert_file_to_binary(p_path, tmp_path);
+	ERR_FAIL_COND(err != OK);
+	Vector<uint8_t> data = FileAccess::get_file_as_array(tmp_path);
+	ERR_FAIL_COND(data.size() == 0);
+	add_file(p_path + ".converted.res", data, true);
+}
+
+EditorExportTextSceneToBinaryPlugin::EditorExportTextSceneToBinaryPlugin() {
+	// Defines the project setting that _export_file() reads; false is passed as its default.
+	GLOBAL_DEF("editor/convert_text_resources_to_binary_on_export", false);
+}
diff --git a/editor/editor_export.h b/editor/editor_export.h
index 6621b80602..02b15aff10 100644
--- a/editor/editor_export.h
+++ b/editor/editor_export.h
@@ -408,4 +408,13 @@ public:
 	EditorExportPlatformPC();
 };
 
+class EditorExportTextSceneToBinaryPlugin : public EditorExportPlugin {
+	// Export plugin that can convert text resources (.tres/.tscn) to binary at export time.
+	GDCLASS(EditorExportTextSceneToBinaryPlugin, EditorExportPlugin)
+
+public:
+	virtual void _export_file(const String &p_path, const String &p_type, const Set<String> &p_features);
+	EditorExportTextSceneToBinaryPlugin();
+};
+
 #endif // EDITOR_IMPORT_EXPORT_H
diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp
index 27ed53bb42..cb8407386d 100644
--- a/editor/editor_node.cpp
+++ b/editor/editor_node.cpp
@@ -67,6 +67,7 @@
 #include "editor/plugins/animation_player_editor_plugin.h"
 #include "editor/plugins/animation_tree_editor_plugin.h"
 #include "editor/plugins/asset_library_editor_plugin.h"
+#include "editor/plugins/baked_lightmap_editor_plugin.h"
 #include "editor/plugins/camera_editor_plugin.h"
 #include "editor/plugins/canvas_item_editor_plugin.h"
 #include "editor/plugins/collision_polygon_2d_editor_plugin.h"
@@ -3384,14 +3385,14 @@ void EditorNode::stop_child_process() {
 	_menu_option_confirm(RUN_STOP, false);
 }
 
-void EditorNode::progress_add_task(const String &p_task, const String &p_label, int p_steps) {
+void EditorNode::progress_add_task(const String &p_task, const String &p_label, int p_steps, bool p_can_cancel) {
 
-	singleton->progress_dialog->add_task(p_task, p_label, p_steps);
+	singleton->progress_dialog->add_task(p_task, p_label, p_steps, p_can_cancel);
 }
 
-void EditorNode::progress_task_step(const String &p_task, const String &p_state, int p_step, bool p_force_refresh) {
+bool EditorNode::progress_task_step(const String &p_task, const String &p_state, int p_step, bool p_force_refresh) {
 
-	singleton->progress_dialog->task_step(p_task, p_state, p_step, p_force_refresh);
+	return singleton->progress_dialog->task_step(p_task, p_state, p_step, p_force_refresh);
 }
 
 void EditorNode::progress_end_task(const String &p_task) {
@@ -4707,6 +4708,7 @@ EditorNode::EditorNode() {
 	EditorHelp::generate_doc(); //before any editor classes are crated
 	SceneState::set_disable_placeholders(true);
 	ResourceLoader::clear_translation_remaps(); //no remaps using during editor
+	ResourceLoader::clear_path_remaps();
 	editor_initialize_certificates(); //for asset sharing
 
 	InputDefault *id = Object::cast_to<InputDefault>(Input::get_singleton());
@@ -5659,6 +5661,7 @@ EditorNode::EditorNode() {
 	add_editor_plugin(memnew(TextureRegionEditorPlugin(this)));
 	add_editor_plugin(memnew(Particles2DEditorPlugin(this)));
 	add_editor_plugin(memnew(GIProbeEditorPlugin(this)));
+	add_editor_plugin(memnew(BakedLightmapEditorPlugin(this)));
 	add_editor_plugin(memnew(Path2DEditorPlugin(this)));
 	add_editor_plugin(memnew(PathEditorPlugin(this)));
 	add_editor_plugin(memnew(Line2DEditorPlugin(this)));
@@ -5715,6 +5718,11 @@ EditorNode::EditorNode() {
 	editor_plugins_force_over = memnew(EditorPluginList);
 	editor_plugins_force_input_forwarding = memnew(EditorPluginList);
 
+	Ref<EditorExportTextSceneToBinaryPlugin> export_text_to_binary_plugin;
+	export_text_to_binary_plugin.instance();
+
+	EditorExport::get_singleton()->add_export_plugin(export_text_to_binary_plugin);
+
 	_edit_current();
 	current = NULL;
 
diff --git a/editor/editor_node.h b/editor/editor_node.h
index 658d5dc0ae..e7ef9eefb5 100644
--- a/editor/editor_node.h
+++ b/editor/editor_node.h
@@ -745,8 +745,8 @@ public:
 
 	static void add_io_error(const String &p_error);
 
-	static void progress_add_task(const String &p_task, const String &p_label, int p_steps);
-	static void progress_task_step(const String &p_task, const String &p_state, int p_step = -1, bool p_force_refresh = true);
+	static void progress_add_task(const String &p_task, const String &p_label, int p_steps, bool p_can_cancel = false);
+	static bool progress_task_step(const String &p_task, const String &p_state, int p_step = -1, bool p_force_refresh = true);
 	static void progress_end_task(const String &p_task);
 
 	static void progress_add_task_bg(const String &p_task, const String &p_label, int p_steps);
@@ -807,9 +807,9 @@ public:
 struct EditorProgress {
 
 	String task;
-	void step(const String &p_state, int p_step = -1, bool p_force_refresh = true) { EditorNode::progress_task_step(task, p_state, p_step, p_force_refresh); }
-	EditorProgress(const String &p_task, const String &p_label, int p_amount) {
-		EditorNode::progress_add_task(p_task, p_label, p_amount);
+	bool step(const String &p_state, int p_step = -1, bool p_force_refresh = true) { return EditorNode::progress_task_step(task, p_state, p_step, p_force_refresh); }
+	EditorProgress(const String &p_task, const String &p_label, int p_amount, bool p_can_cancel = false) {
+		EditorNode::progress_add_task(p_task, p_label, p_amount, p_can_cancel);
 		task = p_task;
 	}
 	~EditorProgress() { EditorNode::progress_end_task(task); }
diff --git a/editor/editor_sub_scene.cpp b/editor/editor_sub_scene.cpp
index b81dfd3f46..fad9346b38 100644
--- a/editor/editor_sub_scene.cpp
+++ b/editor/editor_sub_scene.cpp
@@ -96,14 +96,54 @@ void EditorSubScene::_fill_tree(Node *p_node, TreeItem *p_parent) {
 	}
 }
 
-void EditorSubScene::ok_pressed() {
+void EditorSubScene::_selected_changed() {
+	is_root = false; // forget any earlier pick of the scene root
+	selection.clear(); // entries are added again through _item_multi_selected()
+}
 
-	TreeItem *s = tree->get_selected();
-	if (!s)
-		return;
-	Node *selnode = s->get_metadata(0);
-	if (!selnode)
+void EditorSubScene::_item_multi_selected(Object *p_object, int p_cell, bool p_selected) {
+	// Keeps `selection` in step with the tree: p_object is the toggled TreeItem, p_selected its new state.
+	if (is_root) // the scene root is already picked; later toggles are ignored
+		return;
+	TreeItem *item = Object::cast_to<TreeItem>(p_object);
+	ERR_FAIL_COND(!item);
+	Node *n = item->get_metadata(0);
+	if (!n)
+		return;
+	is_root = p_selected && n == scene; // was false here, see the guard above
+	if (is_root)
+		selection.clear(); // picking the root replaces whatever was selected before
+	List<Node *>::Element *E = selection.find(n);
+	if (p_selected && !E)
+		selection.push_back(n); // repeated select events must not add duplicates
+	else if (!p_selected && E)
+		selection.erase(E); // a deselected node must not be moved on import
+}
+
+void EditorSubScene::_remove_selection_child(Node *n) {
+	// Removes every descendant of n from `selection`, depth-first.
+	// A childless node simply skips the loop, so no child-count guard is needed.
+	for (int idx = 0; idx < n->get_child_count(); idx++) {
+		Node *child = n->get_child(idx);
+		List<Node *>::Element *hit = selection.find(child);
+		if (hit) {
+			// Shift the entry to the tail, then pop it off the list.
+			selection.move_to_back(hit);
+			selection.pop_back();
+		}
+		// Continue into the grandchildren.
+		_remove_selection_child(child);
+	}
+}
+
+void EditorSubScene::ok_pressed() {
+	if (selection.size() <= 0) {
 		return;
+	}
+	for (List<Node *>::Element *E = selection.front(); E; E = E->next()) {
+		Node *c = E->get();
+		_remove_selection_child(c);
+	}
 	emit_signal("subscene_selected");
 	hide();
 	clear();
@@ -127,37 +167,34 @@ void EditorSubScene::_reown(Node *p_node, List<Node *> *p_to_reown) {
 }
 
 void EditorSubScene::move(Node *p_new_parent, Node *p_new_owner) {
-
 	if (!scene) {
 		return;
 	}
-	TreeItem *s = tree->get_selected();
-	if (!s) {
-		return;
-	}
 
-	Node *selnode = s->get_metadata(0);
-	if (!selnode) {
+	if (selection.size() <= 0) { // Nothing was picked in the tree, so there is nothing to move.
 		return;
 	}
 
-	List<Node *> to_reown;
-	_reown(selnode, &to_reown);
-
-	if (selnode != scene) {
-		selnode->get_parent()->remove_child(selnode);
-	}
+	for (List<Node *>::Element *E = selection.front(); E; E = E->next()) { // Re-parent each selected node under p_new_parent.
+		Node *selnode = E->get();
+		if (!selnode) {
+			return;
+		}
+		List<Node *> to_reown; // Filled by _reown(); every listed node gets p_new_owner below.
+		_reown(selnode, &to_reown);
+		if (selnode != scene) {
+			selnode->get_parent()->remove_child(selnode);
+		}
 
-	p_new_parent->add_child(selnode);
-	for (List<Node *>::Element *E = to_reown.front(); E; E = E->next()) {
-		E->get()->set_owner(p_new_owner);
+		p_new_parent->add_child(selnode);
+		for (List<Node *>::Element *F = to_reown.front(); F; F = F->next()) { // F, not E: avoids shadowing the selection iterator.
+			F->get()->set_owner(p_new_owner);
+		}
 	}
-
-	if (selnode != scene) {
+	if (!is_root) { // The scene root itself was not moved, so the leftover imported tree is freed here.
 		memdelete(scene);
 	}
 	scene = NULL;
-
 	//return selnode;
 }
 
@@ -172,12 +209,15 @@ void EditorSubScene::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("_path_selected"), &EditorSubScene::_path_selected);
 	ClassDB::bind_method(D_METHOD("_path_changed"), &EditorSubScene::_path_changed);
 	ClassDB::bind_method(D_METHOD("_path_browse"), &EditorSubScene::_path_browse);
+	ClassDB::bind_method(D_METHOD("_item_multi_selected"), &EditorSubScene::_item_multi_selected);
+	ClassDB::bind_method(D_METHOD("_selected_changed"), &EditorSubScene::_selected_changed);
 	ADD_SIGNAL(MethodInfo("subscene_selected"));
 }
 
 EditorSubScene::EditorSubScene() {
 
 	scene = NULL;
+	is_root = false;
 
 	set_title(TTR("Select Node(s) to Import"));
 	set_hide_on_ok(false);
@@ -200,6 +240,11 @@ EditorSubScene::EditorSubScene() {
 	tree = memnew(Tree);
 	tree->set_v_size_flags(SIZE_EXPAND_FILL);
 	vb->add_margin_child(TTR("Import From Node:"), tree, true);
+	tree->set_select_mode(Tree::SELECT_MULTI);
+	tree->connect("multi_selected", this, "_item_multi_selected");
+	//tree->connect("nothing_selected", this, "_deselect_items");
+	tree->connect("cell_selected", this, "_selected_changed");
+
 	tree->connect("item_activated", this, "_ok", make_binds(), CONNECT_DEFERRED);
 
 	file_dialog = memnew(EditorFileDialog);
diff --git a/editor/editor_sub_scene.h b/editor/editor_sub_scene.h
index 13ce19bbb2..db9d91018a 100644
--- a/editor/editor_sub_scene.h
+++ b/editor/editor_sub_scene.h
@@ -38,13 +38,18 @@ class EditorSubScene : public ConfirmationDialog {
 
 	GDCLASS(EditorSubScene, ConfirmationDialog);
 
+	List<Node *> selection;
 	LineEdit *path;
 	Tree *tree;
 	Node *scene;
+	bool is_root;
 
 	EditorFileDialog *file_dialog;
 
 	void _fill_tree(Node *p_node, TreeItem *p_parent);
+	void _selected_changed();
+	void _item_multi_selected(Object *p_object, int p_cell, bool p_selected);
+	void _remove_selection_child(Node *c);
 	void _reown(Node *p_node, List<Node *> *p_to_reown);
 
 	void ok_pressed();
diff --git a/editor/export_template_manager.cpp b/editor/export_template_manager.cpp
index 2aad4774b0..cdb7256329 100644
--- a/editor/export_template_manager.cpp
+++ b/editor/export_template_manager.cpp
@@ -385,7 +385,7 @@ void ExportTemplateManager::_http_download_templates_completed(int p_status, int
 			template_list_state->set_text(TTR("No response."));
 		} break;
 		case HTTPRequest::RESULT_REQUEST_FAILED: {
-			template_list_state->set_text(TTR("Req. Failed."));
+			template_list_state->set_text(TTR("Request Failed."));
 		} break;
 		case HTTPRequest::RESULT_REDIRECT_LIMIT_REACHED: {
 			template_list_state->set_text(TTR("Redirect Loop."));
@@ -465,7 +465,7 @@ void ExportTemplateManager::_notification(int p_what) {
 				break;
 			case HTTPClient::STATUS_CONNECTING: status = TTR("Connecting.."); break;
 			case HTTPClient::STATUS_CANT_CONNECT:
-				status = TTR("Can't Conect");
+				status = TTR("Can't Connect");
 				errored = true;
 				break;
 			case HTTPClient::STATUS_CONNECTED: status = TTR("Connected"); break;
diff --git a/editor/icons/icon_GUI_visibility_hidden.svg b/editor/icons/icon_GUI_visibility_hidden.svg
index 2add2e9eb8..61131c77c8 100644
--- a/editor/icons/icon_GUI_visibility_hidden.svg
+++ b/editor/icons/icon_GUI_visibility_hidden.svg
@@ -1,55 +1,3 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="16"
-   height="16"
-   version="1.1"
-   viewBox="0 0 16 16"
-   id="svg2"
-   inkscape:version="0.91 r13725"
-   sodipodi:docname="icon_GUI_visibility_hidden.svg">
-  <metadata
-     id="metadata12">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title />
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <defs
-     id="defs10" />
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="1920"
-     inkscape:window-height="1027"
-     id="namedview8"
-     showgrid="false"
-     inkscape:zoom="14.75"
-     inkscape:cx="18.882384"
-     inkscape:cy="7.2939487"
-     inkscape:window-x="-8"
-     inkscape:window-y="-8"
-     inkscape:window-maximized="1"
-     inkscape:current-layer="svg2" />
-  <path
-     style="color:#000000;text-indent:0;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;text-transform:none;white-space:normal;isolation:auto;mix-blend-mode:normal;solid-color:#000000;fill:#e0e0e0;fill-opacity:1;fill-rule:evenodd;color-rendering:auto;image-rendering:auto;shape-rendering:auto"
-     d="M 8.3320312 2.1328125 C 8.1166713 2.129146 7.900423 2.1368613 7.6855469 2.1542969 C 4.8418629 2.3850399 2.1034153 4.4237115 1.0449219 7.5722656 C 0.98765482 7.7577705 0.9856205 7.9559357 1.0390625 8.1425781 C 1.2458895 8.8664725 1.5352035 9.5092453 1.8730469 10.089844 L 12.501953 3.7890625 C 11.256805 2.6845102 9.797893 2.1577685 8.3320312 2.1328125 z M 14.554688 3.3046875 L 0.7421875 11.507812 L 1.4453125 12.695312 L 15.257812 4.4921875 L 14.554688 3.3046875 z M 14.169922 5.8847656 L 3.6171875 12.140625 C 4.9944165 13.294116 6.6188565 13.867188 8 13.867188 C 10.5 13.867188 13.836536 12.077978 14.960938 8.1425781 C 15.012856 7.9619931 15.012856 7.7704285 14.960938 7.5898438 C 14.731965 6.9583712 14.46336 6.3981967 14.169922 5.8847656 z "
-     id="path6" />
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<path d="m7.9998 2c-2.5567 0-5.7907 1.9477-6.9551 5.7051a1.0001 1.0001 0 0 0 -0.00586 0.5703c1.1244 3.9354 4.4609 5.7246 6.9609 5.7246s5.8365-1.7892 6.9609-5.7246a1.0001 1.0001 0 0 0 0 -0.5527c-1.1003-3.7876-4.4066-5.7227-6.9609-5.7227zm0 2a4 4 0 0 1 4 4 4 4 0 0 1 -4 4 4 4 0 0 1 -4 -4 4 4 0 0 1 4 -4zm0 2a2 2 0 0 0 -2 2 2 2 0 0 0 2 2 2 2 0 0 0 2 -2 2 2 0 0 0 -2 -2z" color="#000000" color-rendering="auto" fill="#e0e0e0" fill-opacity=".39216" fill-rule="evenodd" image-rendering="auto" shape-rendering="auto" solid-color="#000000" style="isolation:auto;mix-blend-mode:normal;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-transform:none;white-space:normal"/>
 </svg>
diff --git a/editor/icons/icon_GUI_visibility_visible.svg b/editor/icons/icon_GUI_visibility_visible.svg
index 11ae563779..e3aff37058 100644
--- a/editor/icons/icon_GUI_visibility_visible.svg
+++ b/editor/icons/icon_GUI_visibility_visible.svg
@@ -1,63 +1,3 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="16"
-   height="16"
-   version="1.1"
-   viewBox="0 0 16 16"
-   id="svg2"
-   inkscape:version="0.91 r13725"
-   sodipodi:docname="icon_visibility_visible.svg">
-  <metadata
-     id="metadata12">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <defs
-     id="defs10" />
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="1920"
-     inkscape:window-height="1027"
-     id="namedview8"
-     showgrid="false"
-     inkscape:zoom="14.75"
-     inkscape:cx="15.823281"
-     inkscape:cy="12.108563"
-     inkscape:window-x="-8"
-     inkscape:window-y="-8"
-     inkscape:window-maximized="1"
-     inkscape:current-layer="svg2" />
-  <g
-     transform="translate(0 -1036.4)"
-     id="g4"
-     style="fill:#e0e0e0;fill-opacity:1">
-    <path
-       transform="translate(0,1036.4)"
-       d="M 8,2 C 5.4433,2 2.2093,3.9477 1.0449,7.7051 c -0.0572671,0.1855049 -0.059303,0.3836676 -0.00586,0.57031 1.1244,3.9354 4.4609,5.7246 6.9609,5.7246 2.5000004,0 5.8365004,-1.7892 6.9609004,-5.7246 0.05192,-0.180585 0.05192,-0.372145 0,-0.55273 -1.1003,-3.7876 -4.4066,-5.7227 -6.9609004,-5.7227 z m 0,2 c 2.209139,0 4,1.790861 4,4 0,2.209139 -1.790861,4 -4,4 C 5.790861,12 4,10.209139 4,8 4,5.790861 5.790861,4 8,4 Z M 8,6 C 6.8954305,6 6,6.8954305 6,8 6,9.1045695 6.8954305,10 8,10 9.1045695,10 10,9.1045695 10,8 10,6.8954305 9.1045695,6 8,6 Z"
-       style="color:#000000;text-indent:0;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;text-transform:none;white-space:normal;isolation:auto;mix-blend-mode:normal;solid-color:#000000;fill:#e0e0e0;fill-opacity:1;fill-rule:evenodd;color-rendering:auto;image-rendering:auto;shape-rendering:auto"
-       id="path6"
-       inkscape:connector-curvature="0"
-       sodipodi:nodetypes="cccsccccssssssssss" />
-  </g>
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<path d="m8 2c-2.5567 0-5.7907 1.9477-6.9551 5.7051a1.0001 1.0001 0 0 0 -0.00586 0.57031c1.1244 3.9354 4.4609 5.7246 6.9609 5.7246s5.8365-1.7892 6.9609-5.7246a1.0001 1.0001 0 0 0 0 -0.55273c-1.1003-3.7876-4.4066-5.7227-6.9609-5.7227zm0 2a4 4 0 0 1 4 4 4 4 0 0 1 -4 4 4 4 0 0 1 -4 -4 4 4 0 0 1 4 -4zm0 2a2 2 0 0 0 -2 2 2 2 0 0 0 2 2 2 2 0 0 0 2 -2 2 2 0 0 0 -2 -2z" color="#000000" color-rendering="auto" fill="#e0e0e0" fill-opacity=".99608" fill-rule="evenodd" image-rendering="auto" shape-rendering="auto" solid-color="#000000" style="isolation:auto;mix-blend-mode:normal;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-transform:none;white-space:normal"/>
 </svg>
diff --git a/editor/icons/icon_GUI_visibility_xray.svg b/editor/icons/icon_GUI_visibility_xray.svg
index 1fd9fcf1b5..b78709821f 100644
--- a/editor/icons/icon_GUI_visibility_xray.svg
+++ b/editor/icons/icon_GUI_visibility_xray.svg
@@ -1,61 +1,6 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="16"
-   height="16"
-   version="1.1"
-   viewBox="0 0 16 16"
-   id="svg2"
-   inkscape:version="0.91 r13725"
-   sodipodi:docname="icon_GUI_visibility_xray.svg">
-  <metadata
-     id="metadata12">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title />
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <defs
-     id="defs10" />
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="1920"
-     inkscape:window-height="1027"
-     id="namedview8"
-     showgrid="false"
-     inkscape:zoom="7.375"
-     inkscape:cx="43.019438"
-     inkscape:cy="-8.9853027"
-     inkscape:window-x="-8"
-     inkscape:window-y="-8"
-     inkscape:window-maximized="1"
-     inkscape:current-layer="svg2" />
-  <g
-     transform="translate(0.20338214,-1036.671)"
-     id="g4"
-     style="fill:#e0e0e0;fill-opacity:1" />
-  <path
-     id="path4154"
-     style="opacity:1;fill:#e0e0e0;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.42799997;stroke-linecap:square;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
-     d="m 5.0107427,7.1191578 c -0.084872,0.2859445 -0.1282828,0.5825859 -0.128907,0.8808593 8.579e-4,0.263009 0.034983,0.5248532 0.101563,0.7792969 l 6.0312493,0 C 11.081887,8.5249547 11.116668,8.2631092 11.118164,8.0000171 11.11754,7.7017437 11.074129,7.4051023 10.989257,7.1191578 Z M 7.9999096,2.000005 c -2.5567,0 -5.7907,1.9477 -6.9551,5.7051 -0.057267,0.1855049 -0.059303,0.3836676 -0.00586,0.57031 1.1244,3.9354 4.4609,5.7246 6.9609,5.7246 2.4999994,0 5.8364994,-1.7892 6.9608994,-5.7246 0.05192,-0.180585 0.05192,-0.372145 0,-0.55273 -1.1003,-3.7876 -4.4066,-5.7227 -6.9608994,-5.7227 z m 0,2 c 2.2091384,0 3.9999994,1.790861 3.9999994,4 0,2.209139 -1.790861,4 -3.9999994,4 -2.209139,0 -4,-1.790861 -4,-4 0,-2.209139 1.790861,-4 4,-4 z"
-     inkscape:connector-curvature="0"
-     sodipodi:nodetypes="ccccccccccsccccsssss" />
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<g fill="#e0e0e0" fill-rule="evenodd" shape-rendering="auto">
+<path d="m7.9998 2c-2.5567 0-5.7907 1.9477-6.9551 5.7051a1.0001 1.0001 0 0 0 -0.00586 0.5703c1.1244 3.9354 4.4609 5.7246 6.9609 5.7246s5.8365-1.7892 6.9609-5.7246a1.0001 1.0001 0 0 0 0 -0.5527c-1.1003-3.7876-4.4066-5.7227-6.9609-5.7227zm0 2a4 4 0 0 1 4 4 4 4 0 0 1 -4 4 4 4 0 0 1 -4 -4 4 4 0 0 1 4 -4zm0 2a2 2 0 0 0 -2 2 2 2 0 0 0 2 2 2 2 0 0 0 2 -2 2 2 0 0 0 -2 -2z" color="#000000" color-rendering="auto" fill-opacity=".39216" image-rendering="auto" solid-color="#000000" style="isolation:auto;mix-blend-mode:normal;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-transform:none;white-space:normal"/>
+<path d="m8 2c-2.5567 0-5.7907 1.9477-6.9551 5.7051a1.0001 1.0001 0 0 0 -0.00586 0.57031c1.1244 3.9354 4.4609 5.7246 6.9609 5.7246v-2a4 4 0 0 1 -4 -4 4 4 0 0 1 4 -4zm0 4a2 2 0 0 0 -2 2 2 2 0 0 0 2 2z" color="#000000" color-rendering="auto" fill-opacity=".99608" image-rendering="auto" solid-color="#000000" style="isolation:auto;mix-blend-mode:normal;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-transform:none;white-space:normal"/>
+</g>
 </svg>
diff --git a/editor/icons/icon_bake.svg b/editor/icons/icon_bake.svg
index 4a9ccfed12..ca5245da10 100644
--- a/editor/icons/icon_bake.svg
+++ b/editor/icons/icon_bake.svg
@@ -1,5 +1,3 @@
 <svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)">
-<path transform="translate(0 1036.4)" d="m2 1v2h12v-2h-12zm-1 3v9a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-9h-14zm2 1h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm-9 2h10v6h-10v-6zm3 1v1h4v-1h-4z" fill="#e0e0e0"/>
-</g>
+<path d="m2 1v2h12v-2h-12zm-1 3v9a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-9h-14zm2 1h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm-9 2h10v6h-10v-6zm3 1v1h4v-1h-4z" fill="#e0e0e0"/>
 </svg>
diff --git a/editor/icons/icon_baked_light.svg b/editor/icons/icon_baked_light.svg
deleted file mode 100644
index f5bf07a444..0000000000
--- a/editor/icons/icon_baked_light.svg
+++ /dev/null
@@ -1,5 +0,0 @@
-<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)">
-<path transform="translate(0 1036.4)" d="m2 1v2h12v-2h-12zm-1 3v9a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-9h-14zm2 1h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm-9 2h10v6h-10v-6zm3 1v1h4v-1h-4z" fill="#fc9c9c"/>
-</g>
-</svg>
diff --git a/editor/icons/icon_baked_light_instance.svg b/editor/icons/icon_baked_light_instance.svg
deleted file mode 100644
index f5bf07a444..0000000000
--- a/editor/icons/icon_baked_light_instance.svg
+++ /dev/null
@@ -1,5 +0,0 @@
-<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)">
-<path transform="translate(0 1036.4)" d="m2 1v2h12v-2h-12zm-1 3v9a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-9h-14zm2 1h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm-9 2h10v6h-10v-6zm3 1v1h4v-1h-4z" fill="#fc9c9c"/>
-</g>
-</svg>
diff --git a/editor/icons/icon_baked_light_sampler.svg b/editor/icons/icon_baked_light_sampler.svg
deleted file mode 100644
index 0bf630039d..0000000000
--- a/editor/icons/icon_baked_light_sampler.svg
+++ /dev/null
@@ -1,5 +0,0 @@
-<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)">
-<path transform="translate(0 1036.4)" d="m2 1v2h12v-2h-12zm-1 3v9a2 2 0 0 0 2 2h4v-2h-4v-6h4 6 2v-3h-14zm2 1h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm-6 3v1h1v-1h-1zm4 1a1 1 0 0 0 -1 1v4a1 1 0 0 0 1 1h4a1 1 0 0 0 1 -1v-4a1 1 0 0 0 -1 -1h-4zm3 1a1 1 0 0 1 1 1 1 1 0 0 1 -1 1 1 1 0 0 1 -1 -1 1 1 0 0 1 1 -1zm-2 2a1 1 0 0 1 1 1 1 1 0 0 1 -1 1 1 1 0 0 1 -1 -1 1 1 0 0 1 1 -1z" fill="#fc9c9c"/>
-</g>
-</svg>
diff --git a/editor/icons/icon_baked_lightmap.svg b/editor/icons/icon_baked_lightmap.svg
new file mode 100644
index 0000000000..6c6586244e
--- /dev/null
+++ b/editor/icons/icon_baked_lightmap.svg
@@ -0,0 +1,3 @@
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<path d="m2 1v2h12v-2h-12zm-1 3v9a2 2 0 0 0 2 2h10a2 2 0 0 0 2 -2v-9h-14zm2 1h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm3 0h1v1h-1v-1zm-9 2h10v6h-10v-6zm3 1v1h4v-1h-4z" fill="#fc9c9c"/>
+</svg>
diff --git a/editor/icons/icon_baked_lightmap_data.svg b/editor/icons/icon_baked_lightmap_data.svg
new file mode 100644
index 0000000000..b5ddd24680
--- /dev/null
+++ b/editor/icons/icon_baked_lightmap_data.svg
@@ -0,0 +1,3 @@
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<path d="m1 1v2h2v-2h-2zm3 0v2h2v-2h-2zm4 0v2h6v-2h-6zm-7 3v2h2v-2h-2zm3 0v2h2v-2h-2zm4 0v3h5v6h-5v2h5a2 2 0 0 0 2 -2v-9h-7zm1 1h1v1h-1v-1zm3 0h1v1h-1v-1zm-11 2v2h2v-2h-2zm3 0v2h2v-2h-2zm4 1v1h2v-1h-2zm-7 2v2h2v-2h-2zm3 0v2h2v-2h-2zm-3 3v2h2v-2h-2zm3 0v2h2v-2h-2z" fill="#e0e0e0"/>
+</svg>
diff --git a/editor/icons/icon_editor_handle.svg b/editor/icons/icon_editor_handle.svg
index 05f3e2f2cc..f215820ddc 100644
--- a/editor/icons/icon_editor_handle.svg
+++ b/editor/icons/icon_editor_handle.svg
@@ -1,7 +1,5 @@
 <svg width="10" height="10" version="1.1" viewBox="0 0 10 10" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1042.4)">
-<ellipse cx="5" cy="1047.4" rx="5" ry="5" fill-opacity=".29412"/>
-<ellipse cx="5" cy="1047.4" rx="4" ry="4" fill="#fff"/>
-<ellipse cx="5" cy="1047.4" rx="3" ry="3" fill="#ff8484"/>
-</g>
+<circle cx="5" cy="5" r="5" fill-opacity=".29412"/>
+<circle cx="5" cy="5" r="4" fill="#fff"/>
+<circle cx="5" cy="5" r="3" fill="#ff8484"/>
 </svg>
diff --git a/editor/icons/icon_editor_handle_add.svg b/editor/icons/icon_editor_handle_add.svg
index be61cd53f9..a8bc1fdc9b 100644
--- a/editor/icons/icon_editor_handle_add.svg
+++ b/editor/icons/icon_editor_handle_add.svg
@@ -1,5 +1,5 @@
 <svg width="10" height="10" version="1.1" viewBox="0 0 10 10" xmlns="http://www.w3.org/2000/svg">
-	<circle cx="5" cy="5" r="5" fill-opacity=".29412"/>
-	<circle cx="5" cy="5" r="4" fill="#474747"/>
-	<path d="m4 2v2h-2v2h2v2h2v-2h2v-2h-2v-2z" fill="#84ffb1"/>
+<circle cx="5" cy="5" r="5" fill-opacity=".29412"/>
+<circle cx="5" cy="5" r="4" fill="#474747"/>
+<path d="m4 2v2h-2v2h2v2h2v-2h2v-2h-2v-2z" fill="#84ffb1"/>
 </svg>
diff --git a/editor/icons/icon_editor_plugin.svg b/editor/icons/icon_editor_plugin.svg
index 528a583a04..e68d787bd3 100644
--- a/editor/icons/icon_editor_plugin.svg
+++ b/editor/icons/icon_editor_plugin.svg
@@ -1,9 +1,3 @@
 <svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)" fill="#e0e0e0" fill-opacity=".99608">
-<path d="m2 1038.4v8h8v-8z" fill-rule="evenodd" stroke="#e0e0e0" stroke-linejoin="round" stroke-opacity=".99608" stroke-width="2"/>
-<circle cx="13" cy="1042.4" r="2"/>
-<circle cx="6" cy="1049.4" r="2"/>
-<rect x="5" y="1046.4" width="2" height="2"/>
-<rect x="10" y="1041.4" width="2" height="2"/>
-</g>
+<path d="m2 1c-0.55226 1e-4 -0.99994 0.4477-1 1v8c5.52e-5 0.5523 0.44774 0.9999 1 1h3v0.27148a2 2 0 0 0 -1 1.7285 2 2 0 0 0 2 2 2 2 0 0 0 2 -2 2 2 0 0 0 -1 -1.7305v-0.26953h3c0.55226-1e-4 0.99994-0.4477 1-1v-3h0.27148a2 2 0 0 0 1.7285 1 2 2 0 0 0 2 -2 2 2 0 0 0 -2 -2 2 2 0 0 0 -1.7305 1h-0.26953v-3c-5.5e-5 -0.5523-0.44774-0.9999-1-1h-8z" color="#000000" color-rendering="auto" dominant-baseline="auto" fill="#e0e0e0" fill-opacity=".99608" fill-rule="evenodd" image-rendering="auto" shape-rendering="auto" solid-color="#000000" style="font-feature-settings:normal;font-variant-alternates:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal;font-variant-position:normal;isolation:auto;mix-blend-mode:normal;shape-padding:0;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-orientation:mixed;text-transform:none;white-space:normal"/>
 </svg>
diff --git a/editor/icons/icon_gizmo_baked_lightmap.svg b/editor/icons/icon_gizmo_baked_lightmap.svg
new file mode 100644
index 0000000000..cc21b7c1c5
--- /dev/null
+++ b/editor/icons/icon_gizmo_baked_lightmap.svg
@@ -0,0 +1,4 @@
+<svg width="128" height="128" version="1.1" viewBox="0 0 128 128" xmlns="http://www.w3.org/2000/svg">
+<path d="m18 8c-2.209 2.2e-4 -3.9998 1.791-4 4l0.01563 20h-6.0156c-2.209 2.2e-4 -3.9998 1.791-4 4v71.076c0 9.3065 7.6174 16.924 16.924 16.924h61.076c2.209-2e-4 3.9998-1.791 4-4v-12c-2.21e-4 -2.209-1.791-3.9998-4-4h-58v-40h20v12c2.21e-4 2.209 1.791 3.9998 4 4h32c2.209-2e-4 3.9998-1.791 4-4v-12h20v4c2e-3 0.72576 0.20093 1.4374 0.57617 2.0586-0.19584-6e-3 -0.37901-0.058594-0.57617-0.058594-10.998 0-20 9.0016-20 20-4e-6 0-4e-6 0.0098 0 0.0098 0.0088 6.2734 3.0833 12.01 8 15.756v2.2383c0 2.8834 1.66 5.3456 4 6.75v5.2461c2.21e-4 2.209 1.791 3.9998 4 4h8c2.209-2e-4 3.9998-1.791 4-4v-5.248c2.3405-1.4043 4-3.8682 4-6.752v-2.2344c4.9179-3.7475 7.9931-9.4866 8-15.762 0-7.935-4.7186-14.774-11.459-18h7.459c2.209-2.2e-4 3.9998-1.791 4-4v-32c-2.2e-4 -2.209-1.791-3.9998-4-4l-6-0.003906v-19.996c-2.2e-4 -2.209-1.791-3.9998-4-4zm8 38c1.1519 0 2 0.84806 2 2 3e-6 1.1519-0.84806 2-2 2s-2-0.84806-2-2c-3e-6 -1.1519 0.84806-2 2-2zm25 0c1.1519 0 2 0.84806 2 2 3e-6 1.1519-0.84806 2-2 2s-2-0.84806-2-2c-3e-6 -1.1519 0.84806-2 2-2zm26 0c1.1519 0 2 0.84806 2 2 3e-6 1.1519-0.84806 2-2 2s-2-0.84806-2-2c-3e-6 -1.1519 0.84806-2 2-2zm25 0c1.1519 0 2 0.84806 2 2s-0.84806 2-2 2-2-0.84806-2-2c-3e-6 -1.1519 0.84806-2 2-2zm2 38c3.3611 0 6 2.6388 6 6 0 3.361-2.639 6-6 6-3.361 0-6-2.639-6-6 0-3.3612 2.6389-6 6-6z" color="#000000" color-rendering="auto" dominant-baseline="auto" fill-opacity=".29412" image-rendering="auto" shape-rendering="auto" solid-color="#000000" style="font-feature-settings:normal;font-variant-alternates:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal;font-variant-position:normal;isolation:auto;mix-blend-mode:normal;shape-padding:0;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-orientation:mixed;text-transform:none;white-space:normal"/>
+<path d="m18 12v16h92v-16zm-10 24v71.076c0 7.1594 5.7644 12.924 12.924 12.924h61.076v-12h-62v-48h88v8h12v-32zm18 6c3.3137-1e-5 6 2.6863 6 6 9e-6 3.3137-2.6863 6-6 6-3.3137 1e-5 -6-2.6863-6-6-9e-6 -3.3137 2.6863-6 6-6zm25 0c3.3137-1e-5 6 2.6863 6 6 9e-6 3.3137-2.6863 6-6 6-3.3137 1e-5 -6-2.6863-6-6-9e-6 -3.3137 2.6863-6 6-6zm26 0c3.3137-1e-5 6 2.6863 6 6 9e-6 3.3137-2.6863 6-6 6-3.3137 1e-5 -6-2.6863-6-6-9e-6 -3.3137 2.6863-6 6-6zm25 0c3.3137-1e-5 6 2.6863 6 6 1e-5 3.3137-2.6863 6-6 6-3.3137 1e-5 -6-2.6863-6-6-9e-6 -3.3137 2.6863-6 6-6zm-54 26v8h32v-8zm56 6c-8.8365 0-16 7.1634-16 16 8e-3 5.7082 3.0565 10.98 8 13.834v4.166c0 2.216 1.784 4 4 4h8c2.216 0 4-1.784 4-4v-4.1602c4.945-2.855 7.9937-8.1299 8-13.84 0-8.8366-7.1635-16-16-16zm0 6c5.5228 0 10 4.4771 10 10 0 5.5228-4.4772 10-10 10-5.5228 0-10-4.4772-10-10 0-5.5229 4.4772-10 10-10zm-4 36v4h8v-4z" fill="#f7f5cf"/>
+</svg>
diff --git a/editor/icons/icon_hidden.svg b/editor/icons/icon_hidden.svg
deleted file mode 100644
index 8328156e76..0000000000
--- a/editor/icons/icon_hidden.svg
+++ /dev/null
@@ -1,5 +0,0 @@
-<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)">
-<path transform="translate(0 1036.4)" d="m2.9609 7.7266l-1.9219 0.54883c0.31999 1.12 0.8236 2.0593 1.4316 2.8398l-0.83398 0.83398 1.4141 1.4141 0.84375-0.84375c0.98585 0.74762 2.0766 1.2067 3.1055 1.3867v1.0938h2v-1.0938c1.0288-0.17998 2.1196-0.6391 3.1055-1.3867l0.84375 0.84375 1.4141-1.4141-0.83398-0.83398c0.60804-0.78055 1.1117-1.7199 1.4316-2.8398l-1.9219-0.54883c-0.8756 3.0646-3.5391 4.2734-5.0391 4.2734s-4.1635-1.2088-5.0391-4.2734z" color="#000000" color-rendering="auto" dominant-baseline="auto" fill="#e0e0e0" fill-opacity=".99608" fill-rule="evenodd" image-rendering="auto" shape-rendering="auto" solid-color="#000000" style="filter-blend-mode:normal;filter-gaussianBlur-deviation:0;font-feature-settings:normal;font-variant-alternates:normal;font-variant-caps:normal;font-variant-east-asian:normal;font-variant-ligatures:normal;font-variant-numeric:normal;font-variant-position:normal;isolation:auto;mix-blend-mode:normal;shape-padding:0;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-orientation:mixed;text-transform:none;white-space:normal"/>
-</g>
-</svg>
diff --git a/editor/icons/icon_kinematic_body_2d.svg b/editor/icons/icon_kinematic_body_2d.svg
index 51026e5f28..0441e499c0 100644
--- a/editor/icons/icon_kinematic_body_2d.svg
+++ b/editor/icons/icon_kinematic_body_2d.svg
@@ -1,7 +1,7 @@
 <svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
 <g transform="translate(0 -1036.4)">
-<g transform="translate(.49212 -.0044019)" fill="#a5b7f5" fill-opacity=".98824">
-<path transform="translate(0 1036.4)" d="m6 1c-0.55401 0-1 0.44599-1 1v3c0 0.55401 0.44599 1 1 1h1v0.99023a1.0001 1.0001 0 0 0 -0.31641 0.0625l-2.0508 0.68359-0.68359-2.0508a1.0001 1.0001 0 0 0 -0.99023 -0.69727 1.0001 1.0001 0 0 0 -0.9082 1.3281l1 3a1.0001 1.0001 0 0 0 1.2656 0.63281l1.6836-0.56055v0.61133c0 0.04088 0.018715 0.07566 0.023437 0.11523l-4.5781 3.0527a1.0001 1.0001 0 1 0 1.1094 1.6641l5.0566-3.3711 1.4941 2.9863a1.0001 1.0001 0 0 0 1.2109 0.50195l3-1a1.0001 1.0001 0 1 0 -0.63281 -1.8965l-2.1777 0.72461-0.97461-1.9512c0.2759-0.17764 0.46875-0.47227 0.46875-0.82617v-1h1.3828l0.72266 1.4473a1.0001 1.0001 0 1 0 1.7891 -0.89453l-1-2a1.0001 1.0001 0 0 0 -0.89453 -0.55273h-3v-1h1c0.55401 0 1-0.44599 1-1v-3c0-0.55401-0.44599-1-1-1zm0 2h1v2h-1z" fill="#a5b7f5" fill-opacity=".98824"/>
+<g transform="translate(.49212 -.0044019)" fill="#a5b7f3">
+<path transform="translate(0 1036.4)" d="m6 1c-0.55401 0-1 0.44599-1 1v3c0 0.55401 0.44599 1 1 1h1v0.99023a1.0001 1.0001 0 0 0 -0.31641 0.0625l-2.0508 0.68359-0.68359-2.0508a1.0001 1.0001 0 0 0 -0.99023 -0.69727 1.0001 1.0001 0 0 0 -0.9082 1.3281l1 3a1.0001 1.0001 0 0 0 1.2656 0.63281l1.6836-0.56055v0.61133c0 0.04088 0.018715 0.07566 0.023437 0.11523l-4.5781 3.0527a1.0001 1.0001 0 1 0 1.1094 1.6641l5.0566-3.3711 1.4941 2.9863a1.0001 1.0001 0 0 0 1.2109 0.50195l3-1a1.0001 1.0001 0 1 0 -0.63281 -1.8965l-2.1777 0.72461-0.97461-1.9512c0.2759-0.17764 0.46875-0.47227 0.46875-0.82617v-1h1.3828l0.72266 1.4473a1.0001 1.0001 0 1 0 1.7891 -0.89453l-1-2a1.0001 1.0001 0 0 0 -0.89453 -0.55273h-3v-1h1c0.55401 0 1-0.44599 1-1v-3c0-0.55401-0.44599-1-1-1zm0 2h1v2h-1z" fill="#a5b7f3"/>
 </g>
 </g>
 </svg>
diff --git a/editor/icons/icon_plugin_script.svg b/editor/icons/icon_plugin_script.svg
new file mode 100644
index 0000000000..763cca3a92
--- /dev/null
+++ b/editor/icons/icon_plugin_script.svg
@@ -0,0 +1,3 @@
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<path d="m7 1l-0.56445 2.2578c-0.23643 0.075851-0.46689 0.16921-0.68945 0.2793l-1.9883-1.1934-1.4141 1.4141 1.1953 1.9941c-0.11191 0.22113-0.20723 0.45028-0.28516 0.68555l-2.2539 0.5625v2l2.2578 0.56445c0.048141 0.14946 0.11579 0.29137 0.17773 0.43555h0.58789c0.51595-0.6841 1.1988-1.2456 2.0195-1.5957-0.028019-0.13296-0.042416-0.26842-0.042969-0.4043 9.6e-6 -1.1046 0.89543-2 2-2 1.1046 9.6e-6 2 0.89543 2 2-1.737e-4 0.1345-0.013915 0.26865-0.041016 0.40039 0.82295 0.35108 1.509 0.91301 2.0254 1.5996h0.58008c0.063668-0.14463 0.13192-0.2874 0.18164-0.4375l2.2539-0.5625v-2l-2.2578-0.56445c-0.075942-0.23577-0.1693-0.46557-0.2793-0.6875l1.1934-1.9902-1.4141-1.4141-1.9941 1.1953c-0.22113-0.11191-0.45028-0.20723-0.68555-0.28516l-0.5625-2.2539h-2zm1 6a1 1 0 0 0 -0.99805 0.92969 1 1 0 0 0 -0.0019531 0.070312v2.1738a3 3 0 0 0 -2 2.8262h1v2h1v-2h2v2h1v-2h1a3 3 0 0 0 -0.015625 -0.29883 3 3 0 0 0 -1.9844 -2.5254v-2.1758a1 1 0 0 0 -1 -1z" fill="#e0e0e0"/>
+</svg>
diff --git a/editor/icons/icon_proxy_texture.svg b/editor/icons/icon_proxy_texture.svg
new file mode 100644
index 0000000000..15ed5e7f2b
--- /dev/null
+++ b/editor/icons/icon_proxy_texture.svg
@@ -0,0 +1,5 @@
+<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+<g transform="translate(0 -1036.4)">
+<path transform="translate(0 1036.4)" d="m1 1v4h4v-4h-4zm6 0v2h6v8h-6v4h7a1 1 0 0 0 1 -1v-12a1 1 0 0 0 -1 -1h-7zm2 4v1h-1v1h-1v3h1 2 2v-2h-1v-2h-1v-1h-1zm-8 1v4h4v-4h-4zm0 5v4h4v-4h-4z" fill="#e0e0e0" fill-opacity=".99608"/>
+</g>
+</svg>
diff --git a/editor/icons/icon_sprite.svg b/editor/icons/icon_sprite.svg
index 09fc2f0979..11ad42ec98 100644
--- a/editor/icons/icon_sprite.svg
+++ b/editor/icons/icon_sprite.svg
@@ -1,3 +1,3 @@
 <svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<path d="m5 1c-2.216 0-4 1.784-4 4v6c0 2.216 1.784 4 4 4h6c2.216 0 4-1.784 4-4v-6c0-2.216-1.784-4-4-4h-6zm-1 5c0.554 0 1 0.446 1 1v2c0 0.554-0.446 1-1 1s-1-0.446-1-1v-2c0-0.554 0.446-1 1-1zm8 0c0.554 0 1 0.446 1 1v2c0 0.554-0.446 1-1 1s-1-0.446-1-1v-2c0-0.554 0.446-1 1-1zm-1.8887 5.1074a1.0001 1.0001 0 0 1 0.7168 1.7207c-0.74987 0.74987-1.7676 1.1719-2.8281 1.1719s-2.0783-0.422-2.8281-1.1719a1.0001 1.0001 0 0 1 0.69727 -1.7168 1.0001 1.0001 0 0 1 0.7168 0.30273c0.37534 0.37535 0.88325 0.58594 1.4141 0.58594s1.0387-0.21059 1.4141-0.58594a1.0001 1.0001 0 0 1 0.69727 -0.30664z" fill="#a5b7f6" fill-opacity=".98824"/>
+<path d="m5 1c-2.216 0-4 1.784-4 4v6c0 2.216 1.784 4 4 4h6c2.216 0 4-1.784 4-4v-6c0-2.216-1.784-4-4-4h-6zm-1 5c0.554 0 1 0.446 1 1v2c0 0.554-0.446 1-1 1s-1-0.446-1-1v-2c0-0.554 0.446-1 1-1zm8 0c0.554 0 1 0.446 1 1v2c0 0.554-0.446 1-1 1s-1-0.446-1-1v-2c0-0.554 0.446-1 1-1zm-1.8887 5.1074a1.0001 1.0001 0 0 1 0.7168 1.7207c-0.74987 0.74987-1.7676 1.1719-2.8281 1.1719s-2.0783-0.422-2.8281-1.1719a1.0001 1.0001 0 0 1 0.69727 -1.7168 1.0001 1.0001 0 0 1 0.7168 0.30273c0.37534 0.37535 0.88325 0.58594 1.4141 0.58594s1.0387-0.21059 1.4141-0.58594a1.0001 1.0001 0 0 1 0.69727 -0.30664z" fill="#a5b7f3"/>
 </svg>
diff --git a/editor/icons/icon_visible.svg b/editor/icons/icon_visible.svg
deleted file mode 100644
index 7d157d7b7f..0000000000
--- a/editor/icons/icon_visible.svg
+++ /dev/null
@@ -1,5 +0,0 @@
-<svg width="16" height="16" version="1.1" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
-<g transform="translate(0 -1036.4)">
-<path transform="translate(0 1036.4)" d="m8 2c-2.5567 0-5.7907 1.9477-6.9551 5.7051a1.0001 1.0001 0 0 0 -0.0058594 0.57031c1.1244 3.9354 4.4609 5.7246 6.9609 5.7246s5.8365-1.7892 6.9609-5.7246a1.0001 1.0001 0 0 0 0 -0.55273c-1.1003-3.7876-4.4066-5.7227-6.9609-5.7227zm0 2a4 4 0 0 1 4 4 4 4 0 0 1 -4 4 4 4 0 0 1 -4 -4 4 4 0 0 1 4 -4zm0 2a2 2 0 0 0 -2 2 2 2 0 0 0 2 2 2 2 0 0 0 2 -2 2 2 0 0 0 -2 -2z" color="#000000" color-rendering="auto" fill="#e0e0e0" fill-opacity=".99608" fill-rule="evenodd" image-rendering="auto" shape-rendering="auto" solid-color="#000000" style="block-progression:tb;isolation:auto;mix-blend-mode:normal;text-decoration-color:#000000;text-decoration-line:none;text-decoration-style:solid;text-indent:0;text-transform:none;white-space:normal"/>
-</g>
-</svg>
diff --git a/editor/import/resource_importer_scene.cpp b/editor/import/resource_importer_scene.cpp
index 08d2897250..ed7c6dba79 100644
--- a/editor/import/resource_importer_scene.cpp
+++ b/editor/import/resource_importer_scene.cpp
@@ -1165,8 +1165,8 @@ void ResourceImporterScene::get_import_options(List<ImportOption> *r_options, in
 	r_options->push_back(ImportOption(PropertyInfo(Variant::BOOL, "meshes/compress"), true));
 	r_options->push_back(ImportOption(PropertyInfo(Variant::BOOL, "meshes/ensure_tangents"), true));
 	r_options->push_back(ImportOption(PropertyInfo(Variant::INT, "meshes/storage", PROPERTY_HINT_ENUM, "Built-In,Files"), meshes_out ? 1 : 0));
-	r_options->push_back(ImportOption(PropertyInfo(Variant::INT, "meshes/light_baking", PROPERTY_HINT_ENUM, "Disabled,Enable,Gen Lightmaps"), 0));
-	r_options->push_back(ImportOption(PropertyInfo(Variant::REAL, "meshes/lightmap_texel_size", PROPERTY_HINT_RANGE, "0.001,100,0.001"), 0.05));
+	r_options->push_back(ImportOption(PropertyInfo(Variant::INT, "meshes/light_baking", PROPERTY_HINT_ENUM, "Disabled,Enable,Gen Lightmaps", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_UPDATE_ALL_IF_MODIFIED), 0));
+	r_options->push_back(ImportOption(PropertyInfo(Variant::REAL, "meshes/lightmap_texel_size", PROPERTY_HINT_RANGE, "0.001,100,0.001"), 0.1));
 	r_options->push_back(ImportOption(PropertyInfo(Variant::BOOL, "external_files/store_in_subdir"), false));
 	r_options->push_back(ImportOption(PropertyInfo(Variant::BOOL, "animation/import", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_UPDATE_ALL_IF_MODIFIED), true));
 	r_options->push_back(ImportOption(PropertyInfo(Variant::REAL, "animation/fps", PROPERTY_HINT_RANGE, "1,120,1"), 15));
diff --git a/editor/plugins/asset_library_editor_plugin.cpp b/editor/plugins/asset_library_editor_plugin.cpp
index 3ab8f318a7..f04bc04d92 100644
--- a/editor/plugins/asset_library_editor_plugin.cpp
+++ b/editor/plugins/asset_library_editor_plugin.cpp
@@ -340,7 +340,7 @@ void EditorAssetLibraryItemDownload::_http_download_completed(int p_status, int
 		} break;
 		case HTTPRequest::RESULT_REQUEST_FAILED: {
 			error_text = TTR("Request failed, return code:") + " " + itos(p_code);
-			status->set_text(TTR("Req. Failed."));
+			status->set_text(TTR("Request Failed."));
 		} break;
 		case HTTPRequest::RESULT_REDIRECT_LIMIT_REACHED: {
 			error_text = TTR("Request failed, too many redirects");
diff --git a/editor/plugins/baked_lightmap_editor_plugin.cpp b/editor/plugins/baked_lightmap_editor_plugin.cpp
new file mode 100644
index 0000000000..08f4d06ef7
--- /dev/null
+++ b/editor/plugins/baked_lightmap_editor_plugin.cpp
@@ -0,0 +1,95 @@
+#include "baked_lightmap_editor_plugin.h"
+
+void BakedLightmapEditorPlugin::_bake() {
+	// Handler for the "Bake Lightmaps" button; does nothing until edit() has stored a node.
+	if (lightmap) {
+		BakedLightmap::BakeError err;
+		if (get_tree()->get_edited_scene_root() && get_tree()->get_edited_scene_root() == lightmap) {
+			err = lightmap->bake(lightmap); // The lightmap is the edited scene root: pass it to bake() directly.
+		} else {
+			err = lightmap->bake(lightmap->get_parent()); // Otherwise pass its parent node.
+		}
+		// Show a warning for the failures that have a user-facing explanation.
+		switch (err) {
+			case BakedLightmap::BAKE_ERROR_NO_SAVE_PATH:
+				EditorNode::get_singleton()->show_warning(TTR("Can't determine a save path for lightmap images.\nSave your scene (for images to be saved in the same dir), or pick a save path from the BakedLightmap properties."));
+				break;
+			case BakedLightmap::BAKE_ERROR_NO_MESHES:
+				EditorNode::get_singleton()->show_warning(TTR("No meshes to bake. Make sure they contain an UV2 channel and that the 'Bake Light' flag is on."));
+				break;
+			case BakedLightmap::BAKE_ERROR_CANT_CREATE_IMAGE:
+				EditorNode::get_singleton()->show_warning(TTR("Failed creating lightmap images, make sure path is writable."));
+				break;
+			default: {} // Any other result: nothing to report.
+		}
+	}
+}
+
+void BakedLightmapEditorPlugin::edit(Object *p_object) {
+	// Remember the node to bake. Objects that do not cast to BakedLightmap are
+	// ignored, leaving any previously stored node untouched.
+	BakedLightmap *candidate = Object::cast_to<BakedLightmap>(p_object);
+	if (candidate) {
+		lightmap = candidate;
+	}
+}
+
+bool BakedLightmapEditorPlugin::handles(Object *p_object) const {
+	// True when p_object reports itself as class "BakedLightmap". p_object is dereferenced without a NULL check.
+	return p_object->is_class("BakedLightmap");
+}
+
+void BakedLightmapEditorPlugin::make_visible(bool p_visible) {
+	// Make the bake button follow the visibility state requested for this plugin.
+	if (!p_visible) {
+		bake->hide();
+		return;
+	}
+
+	bake->show();
+}
+
+EditorProgress *BakedLightmapEditorPlugin::tmp_progress = NULL; // Created by bake_func_begin(), destroyed by bake_func_end().
+
+void BakedLightmapEditorPlugin::bake_func_begin(int p_steps) {
+	// Start progress reporting with p_steps steps; fails if a previous progress object still exists.
+	ERR_FAIL_COND(tmp_progress != NULL);
+
+	tmp_progress = memnew(EditorProgress("bake_lightmaps", TTR("Bake Lightmaps"), p_steps, true));
+}
+
+bool BakedLightmapEditorPlugin::bake_func_step(int p_step, const String &p_description) {
+	// Forward one step to the active progress object and return its result; returns false if none is active.
+	ERR_FAIL_COND_V(tmp_progress == NULL, false);
+	return tmp_progress->step(p_description, p_step);
+}
+
+void BakedLightmapEditorPlugin::bake_func_end() {
+	ERR_FAIL_COND(tmp_progress == NULL); // Nothing to tear down unless bake_func_begin() created a progress object.
+	memdelete(tmp_progress);
+	tmp_progress = NULL; // Cleared so the next bake_func_begin() passes its guard.
+}
+
+void BakedLightmapEditorPlugin::_bind_methods() {
+	// Registers _bake under the name the constructor passes to connect("pressed", this, "_bake").
+	ClassDB::bind_method("_bake", &BakedLightmapEditorPlugin::_bake);
+}
+
+BakedLightmapEditorPlugin::BakedLightmapEditorPlugin(EditorNode *p_node) {
+	// No node is being edited yet; edit() fills this in later.
+	lightmap = NULL;
+	editor = p_node;
+	bake = memnew(Button);
+	bake->set_text(TTR("Bake Lightmaps"));
+	bake->set_icon(editor->get_gui_base()->get_icon("Bake", "EditorIcons"));
+	bake->connect("pressed", this, "_bake");
+	bake->hide();
+	add_control_to_container(CONTAINER_SPATIAL_EDITOR_MENU, bake);
+	// Point BakedLightmap's begin/step/end hooks at this plugin's static callbacks.
+	BakedLightmap::bake_end_function = bake_func_end;
+	BakedLightmap::bake_step_function = bake_func_step;
+	BakedLightmap::bake_begin_function = bake_func_begin;
+}
+
+BakedLightmapEditorPlugin::~BakedLightmapEditorPlugin() { // Empty: the destructor performs no cleanup of its own.
+}
diff --git a/editor/plugins/baked_lightmap_editor_plugin.h b/editor/plugins/baked_lightmap_editor_plugin.h
new file mode 100644
index 0000000000..d64c33884a
--- /dev/null
+++ b/editor/plugins/baked_lightmap_editor_plugin.h
@@ -0,0 +1,39 @@
+#ifndef BAKED_LIGHTMAP_EDITOR_PLUGIN_H
+#define BAKED_LIGHTMAP_EDITOR_PLUGIN_H
+
+#include "editor/editor_node.h"
+#include "editor/editor_plugin.h"
+#include "scene/3d/baked_lightmap.h"
+#include "scene/resources/material.h"
+
+class BakedLightmapEditorPlugin : public EditorPlugin {
+	// Editor plugin that adds a "Bake Lightmaps" button for BakedLightmap nodes.
+	GDCLASS(BakedLightmapEditorPlugin, EditorPlugin);
+
+	BakedLightmap *lightmap; // Node most recently accepted by edit(); NULL until then.
+
+	Button *bake; // The "Bake Lightmaps" button; shown and hidden by make_visible().
+	EditorNode *editor; // Editor instance passed to the constructor.
+
+	static EditorProgress *tmp_progress; // Shared by the three bake_func_* callbacks below.
+	static void bake_func_begin(int p_steps); // Creates tmp_progress with p_steps steps.
+	static bool bake_func_step(int p_step, const String &p_description); // Returns the result of tmp_progress->step().
+	static void bake_func_end(); // Destroys tmp_progress.
+
+	void _bake(); // Connected to the bake button's "pressed" signal.
+
+protected:
+	static void _bind_methods();
+
+public:
+	virtual String get_name() const { return "BakedLightmap"; }
+	bool has_main_screen() const { return false; }
+	virtual void edit(Object *p_object); // Stores p_object when it casts to BakedLightmap.
+	virtual bool handles(Object *p_object) const; // True for objects of class "BakedLightmap".
+	virtual void make_visible(bool p_visible); // Shows or hides the bake button.
+
+	BakedLightmapEditorPlugin(EditorNode *p_node);
+	~BakedLightmapEditorPlugin();
+};
+
+#endif // BAKED_LIGHTMAP_EDITOR_PLUGIN_H
diff --git a/editor/plugins/gi_probe_editor_plugin.cpp b/editor/plugins/gi_probe_editor_plugin.cpp
index 443cd2e41f..416b0edb20 100644
--- a/editor/plugins/gi_probe_editor_plugin.cpp
+++ b/editor/plugins/gi_probe_editor_plugin.cpp
@@ -90,7 +90,7 @@ GIProbeEditorPlugin::GIProbeEditorPlugin(EditorNode *p_node) {
 
 	editor = p_node;
 	bake = memnew(Button);
-	bake->set_icon(editor->get_gui_base()->get_icon("BakedLight", "EditorIcons"));
+	bake->set_icon(editor->get_gui_base()->get_icon("Bake", "EditorIcons"));
 	bake->set_text(TTR("Bake GI Probe"));
 	bake->hide();
 	bake->connect("pressed", this, "_bake");
diff --git a/editor/plugins/mesh_instance_editor_plugin.h b/editor/plugins/mesh_instance_editor_plugin.h
index 68c149f98a..32c779509a 100644
--- a/editor/plugins/mesh_instance_editor_plugin.h
+++ b/editor/plugins/mesh_instance_editor_plugin.h
@@ -35,9 +35,9 @@
 #include "scene/3d/mesh_instance.h"
 #include "scene/gui/spin_box.h"
 
-class MeshInstanceEditor : public Node {
+class MeshInstanceEditor : public Control {
 
-	GDCLASS(MeshInstanceEditor, Node);
+	GDCLASS(MeshInstanceEditor, Control);
 
 	enum Menu {
 
diff --git a/editor/plugins/particles_2d_editor_plugin.cpp b/editor/plugins/particles_2d_editor_plugin.cpp
index 5eaa248224..ff8a9f93d6 100644
--- a/editor/plugins/particles_2d_editor_plugin.cpp
+++ b/editor/plugins/particles_2d_editor_plugin.cpp
@@ -77,11 +77,6 @@ void Particles2DEditorPlugin::_menu_callback(int p_idx) {
 		case MENU_CLEAR_EMISSION_MASK: {
 
 			emission_mask->popup_centered_minsize();
-
-			/*undo_redo->create_action(TTR("Clear Emission Mask"));
-			undo_redo->add_do_method(particles, "set_emission_points", PoolVector<Vector2>());
-			undo_redo->add_undo_method(particles, "set_emission_points", particles->get_emission_points());
-			undo_redo->commit_action();*/
 		} break;
 	}
 }
@@ -309,14 +304,6 @@ void Particles2DEditorPlugin::_generate_emission_mask() {
 	} else {
 		pm->set_emission_shape(ParticlesMaterial::EMISSION_SHAPE_POINTS);
 	}
-
-	/*undo_redo->create_action(TTR("Set Emission Mask"));
-	undo_redo->add_do_method(particles, "set_emission_points", epoints);
-	undo_redo->add_do_method(particles, "set_emission_half_extents", extents);
-	undo_redo->add_undo_method(particles, "set_emission_points", particles->get_emission_points());
-	undo_redo->add_undo_method(particles, "set_emission_half_extents", particles->get_emission_half_extents());
-	undo_redo->commit_action();
-	*/
 }
 
 void Particles2DEditorPlugin::_notification(int p_what) {
diff --git a/editor/plugins/particles_editor_plugin.cpp b/editor/plugins/particles_editor_plugin.cpp
index f4a9960087..52eba099c6 100644
--- a/editor/plugins/particles_editor_plugin.cpp
+++ b/editor/plugins/particles_editor_plugin.cpp
@@ -40,11 +40,6 @@ void ParticlesEditor::_node_removed(Node *p_node) {
 	}
 }
 
-void ParticlesEditor::_resource_seleted(const String &p_res) {
-
-	//print_line("selected resource path: "+p_res);
-}
-
 void ParticlesEditor::_node_selected(const NodePath &p_path) {
 
 	Node *sel = get_node(p_path);
@@ -84,23 +79,6 @@ void ParticlesEditor::_node_selected(const NodePath &p_path) {
 	emission_dialog->popup_centered(Size2(300, 130));
 }
 
-/*
-
-void ParticlesEditor::_populate() {
-
-	if(!node)
-		return;
-
-
-	if (node->get_particles().is_null())
-		return;
-
-	node->get_particles()->set_instance_count(populate_amount->get_text().to_int());
-	node->populate_parent(populate_rotate_random->get_val(),populate_tilt_random->get_val(),populate_scale_random->get_text().to_double(),populate_scale->get_text().to_double());
-
-}
-*/
-
 void ParticlesEditor::_notification(int p_notification) {
 
 	if (p_notification == NOTIFICATION_ENTER_TREE) {
@@ -132,13 +110,7 @@ void ParticlesEditor::_menu_option(int p_option) {
 				EditorNode::get_singleton()->show_warning(TTR("A processor material of type 'ParticlesMaterial' is required."));
 				return;
 			}
-			/*
-			Node *root = get_scene()->get_root_node();
-			ERR_FAIL_COND(!root);
-			EditorNode *en = Object::cast_to<EditorNode>(root);
-			ERR_FAIL_COND(!en);
-			Node * node = en->get_edited_scene();
-*/
+
 			emission_tree_dialog->popup_centered_ratio();
 
 		} break;
@@ -365,20 +337,14 @@ void ParticlesEditor::_generate_emission_points() {
 		material->set_emission_point_count(point_count);
 		material->set_emission_point_texture(tex);
 	}
-
-	//print_line("point count: "+itos(points.size()));
-	//node->set_emission_points(points);
 }
 
 void ParticlesEditor::_bind_methods() {
 
 	ClassDB::bind_method("_menu_option", &ParticlesEditor::_menu_option);
-	ClassDB::bind_method("_resource_seleted", &ParticlesEditor::_resource_seleted);
 	ClassDB::bind_method("_node_selected", &ParticlesEditor::_node_selected);
 	ClassDB::bind_method("_generate_emission_points", &ParticlesEditor::_generate_emission_points);
 	ClassDB::bind_method("_generate_aabb", &ParticlesEditor::_generate_aabb);
-
-	//ClassDB::bind_method("_populate",&ParticlesEditor::_populate);
 }
 
 ParticlesEditor::ParticlesEditor() {
@@ -394,8 +360,6 @@ ParticlesEditor::ParticlesEditor() {
 	options->get_popup()->add_separator();
 	options->get_popup()->add_item(TTR("Create Emission Points From Mesh"), MENU_OPTION_CREATE_EMISSION_VOLUME_FROM_MESH);
 	options->get_popup()->add_item(TTR("Create Emission Points From Node"), MENU_OPTION_CREATE_EMISSION_VOLUME_FROM_NODE);
-	//	options->get_popup()->add_item(TTR("Clear Emitter"), MENU_OPTION_CLEAR_EMISSION_VOLUME);
-
 	options->get_popup()->connect("id_pressed", this, "_menu_option");
 
 	emission_dialog = memnew(ConfirmationDialog);
@@ -420,7 +384,6 @@ ParticlesEditor::ParticlesEditor() {
 	emission_dialog->connect("confirmed", this, "_generate_emission_points");
 
 	err_dialog = memnew(ConfirmationDialog);
-	//err_dialog->get_cancel()->hide();
 	add_child(err_dialog);
 
 	emission_file_dialog = memnew(EditorFileDialog);
@@ -454,9 +417,6 @@ ParticlesEditor::ParticlesEditor() {
 	add_child(generate_aabb);
 
 	generate_aabb->connect("confirmed", this, "_generate_aabb");
-
-	//options->set_anchor(MARGIN_LEFT,Control::ANCHOR_END);
-	//options->set_anchor(MARGIN_RIGHT,Control::ANCHOR_END);
 }
 
 void ParticlesEditorPlugin::edit(Object *p_object) {
diff --git a/editor/plugins/particles_editor_plugin.h b/editor/plugins/particles_editor_plugin.h
index 2c8ce88eb2..a65538c7fa 100644
--- a/editor/plugins/particles_editor_plugin.h
+++ b/editor/plugins/particles_editor_plugin.h
@@ -73,7 +73,6 @@ class ParticlesEditor : public Control {
 
 	void _generate_aabb();
 	void _generate_emission_points();
-	void _resource_seleted(const String &p_res);
 	void _node_selected(const NodePath &p_path);
 
 	void _menu_option(int);
diff --git a/editor/plugins/script_editor_plugin.cpp b/editor/plugins/script_editor_plugin.cpp
index c02b3458e5..591e6dac56 100644
--- a/editor/plugins/script_editor_plugin.cpp
+++ b/editor/plugins/script_editor_plugin.cpp
@@ -1329,12 +1329,12 @@ void ScriptEditor::_members_overview_selected(int p_idx) {
 	if (!se) {
 		return;
 	}
-	Dictionary state;
-	state["scroll_position"] = members_overview->get_item_metadata(p_idx);
+	// Go to the member's line and reset the cursor column. We can't just change scroll_position
+	// directly, since code might be folded.
+	se->goto_line(members_overview->get_item_metadata(p_idx));
+	Dictionary state = se->get_edit_state();
 	state["column"] = 0;
-	state["row"] = members_overview->get_item_metadata(p_idx);
 	se->set_edit_state(state);
-	se->ensure_focus();
 }
 
 void ScriptEditor::_help_overview_selected(int p_idx) {
@@ -1845,6 +1845,11 @@ void ScriptEditor::apply_scripts() const {
 	}
 }
 
+void ScriptEditor::open_script_create_dialog(const String &p_base_name, const String &p_base_path) {
+	_menu_option(FILE_NEW); // Same code path as choosing the FILE_NEW menu entry.
+	script_create_dialog->config(p_base_name, p_base_path); // Then configure the dialog with the caller's base name and path.
+}
+
 void ScriptEditor::_editor_play() {
 
 	debugger->start();
@@ -2548,6 +2553,7 @@ void ScriptEditor::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("get_current_script"), &ScriptEditor::_get_current_script);
 	ClassDB::bind_method(D_METHOD("get_open_scripts"), &ScriptEditor::_get_open_scripts);
+	ClassDB::bind_method(D_METHOD("open_script_create_dialog", "base_name", "base_path"), &ScriptEditor::open_script_create_dialog);
 
 	ADD_SIGNAL(MethodInfo("editor_script_changed", PropertyInfo(Variant::OBJECT, "script", PROPERTY_HINT_RESOURCE_TYPE, "Script")));
 	ADD_SIGNAL(MethodInfo("script_close", PropertyInfo(Variant::OBJECT, "script", PROPERTY_HINT_RESOURCE_TYPE, "Script")));
diff --git a/editor/plugins/script_editor_plugin.h b/editor/plugins/script_editor_plugin.h
index ffd42d18ca..9d5c110dec 100644
--- a/editor/plugins/script_editor_plugin.h
+++ b/editor/plugins/script_editor_plugin.h
@@ -360,6 +360,7 @@ public:
 
 	void ensure_focus_current();
 	void apply_scripts() const;
+	void open_script_create_dialog(const String &p_base_name, const String &p_base_path);
 
 	void ensure_select_current();
 
diff --git a/editor/plugins/script_text_editor.cpp b/editor/plugins/script_text_editor.cpp
index 95f2739927..0610f55b3f 100644
--- a/editor/plugins/script_text_editor.cpp
+++ b/editor/plugins/script_text_editor.cpp
@@ -537,10 +537,6 @@ void ScriptTextEditor::set_edit_state(const Variant &p_state) {
 	code_editor->get_text_edit()->cursor_set_line(state["row"]);
 	code_editor->get_text_edit()->set_v_scroll(state["scroll_position"]);
 	code_editor->get_text_edit()->grab_focus();
-
-	//int scroll_pos;
-	//int cursor_column;
-	//int cursor_row;
 }
 
 String ScriptTextEditor::get_name() {
@@ -924,26 +920,7 @@ void ScriptTextEditor::_edit_option(int p_op) {
 			if (scr.is_null())
 				return;
 
-			tx->begin_complex_operation();
-			if (tx->is_selection_active()) {
-				tx->indent_selection_left();
-			} else {
-				int begin = tx->cursor_get_line();
-				String line_text = tx->get_line(begin);
-				// begins with tab
-				if (line_text.begins_with("\t")) {
-					line_text = line_text.substr(1, line_text.length());
-					tx->set_line(begin, line_text);
-				}
-				// begins with 4 spaces
-				else if (line_text.begins_with("    ")) {
-					line_text = line_text.substr(4, line_text.length());
-					tx->set_line(begin, line_text);
-				}
-			}
-			tx->end_complex_operation();
-			tx->update();
-			//tx->deselect();
+			tx->indent_left();
 		} break;
 		case EDIT_INDENT_RIGHT: {
 
@@ -951,18 +928,7 @@ void ScriptTextEditor::_edit_option(int p_op) {
 			if (scr.is_null())
 				return;
 
-			tx->begin_complex_operation();
-			if (tx->is_selection_active()) {
-				tx->indent_selection_right();
-			} else {
-				int begin = tx->cursor_get_line();
-				String line_text = tx->get_line(begin);
-				line_text = '\t' + line_text;
-				tx->set_line(begin, line_text);
-			}
-			tx->end_complex_operation();
-			tx->update();
-			//tx->deselect();
+			tx->indent_right();
 		} break;
 		case EDIT_DELETE_LINE: {
 
@@ -1503,14 +1469,15 @@ void ScriptTextEditor::_make_context_menu(bool p_selection, bool p_color, bool p
 	context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/select_all"), EDIT_SELECT_ALL);
 	context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/undo"), EDIT_UNDO);
 	context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/redo"), EDIT_REDO);
+	context_menu->add_separator();
+	context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/indent_left"), EDIT_INDENT_LEFT);
+	context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/indent_right"), EDIT_INDENT_RIGHT);
+	context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/toggle_comment"), EDIT_TOGGLE_COMMENT);
 
 	if (p_selection) {
 		context_menu->add_separator();
 		context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/convert_to_uppercase"), EDIT_TO_UPPERCASE);
 		context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/convert_to_lowercase"), EDIT_TO_LOWERCASE);
-		context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/indent_left"), EDIT_INDENT_LEFT);
-		context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/indent_right"), EDIT_INDENT_RIGHT);
-		context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/toggle_comment"), EDIT_TOGGLE_COMMENT);
 	}
 	if (p_can_fold || p_is_folded)
 		context_menu->add_shortcut(ED_GET_SHORTCUT("script_text_editor/toggle_fold_line"), EDIT_TOGGLE_FOLD_LINE);
diff --git a/editor/plugins/shader_editor_plugin.cpp b/editor/plugins/shader_editor_plugin.cpp
index b390070b4a..3e00776dfd 100644
--- a/editor/plugins/shader_editor_plugin.cpp
+++ b/editor/plugins/shader_editor_plugin.cpp
@@ -161,7 +161,7 @@ void ShaderTextEditor::_load_theme_settings() {
 
 		for (const Map<StringName, ShaderLanguage::FunctionInfo>::Element *E = ShaderTypes::get_singleton()->get_functions(VisualServer::ShaderMode(shader->get_mode())).front(); E; E = E->next()) {
 
-			for (const Map<StringName, ShaderLanguage::DataType>::Element *F = E->get().built_ins.front(); F; F = F->next()) {
+			for (const Map<StringName, ShaderLanguage::BuiltInInfo>::Element *F = E->get().built_ins.front(); F; F = F->next()) {
 				keywords.push_back(F->key());
 			}
 		}
@@ -379,26 +379,7 @@ void ShaderEditor::_menu_option(int p_option) {
 			if (shader.is_null())
 				return;
 
-			tx->begin_complex_operation();
-			if (tx->is_selection_active()) {
-				tx->indent_selection_left();
-			} else {
-				int begin = tx->cursor_get_line();
-				String line_text = tx->get_line(begin);
-				// begins with tab
-				if (line_text.begins_with("\t")) {
-					line_text = line_text.substr(1, line_text.length());
-					tx->set_line(begin, line_text);
-				}
-				// begins with 4 spaces
-				else if (line_text.begins_with("    ")) {
-					line_text = line_text.substr(4, line_text.length());
-					tx->set_line(begin, line_text);
-				}
-			}
-			tx->end_complex_operation();
-			tx->update();
-			//tx->deselect();
+			tx->indent_left();
 
 		} break;
 		case EDIT_INDENT_RIGHT: {
@@ -407,18 +388,7 @@ void ShaderEditor::_menu_option(int p_option) {
 			if (shader.is_null())
 				return;
 
-			tx->begin_complex_operation();
-			if (tx->is_selection_active()) {
-				tx->indent_selection_right();
-			} else {
-				int begin = tx->cursor_get_line();
-				String line_text = tx->get_line(begin);
-				line_text = '\t' + line_text;
-				tx->set_line(begin, line_text);
-			}
-			tx->end_complex_operation();
-			tx->update();
-			//tx->deselect();
+			tx->indent_right();
 
 		} break;
 		case EDIT_DELETE_LINE: {
diff --git a/editor/plugins/spatial_editor_plugin.cpp b/editor/plugins/spatial_editor_plugin.cpp
index 80638c6f1e..cefc957ebf 100644
--- a/editor/plugins/spatial_editor_plugin.cpp
+++ b/editor/plugins/spatial_editor_plugin.cpp
@@ -4268,7 +4268,7 @@ void SpatialEditor::_menu_item_pressed(int p_option) {
 		case MENU_VISIBILITY_SKELETON: {
 
 			const int idx = view_menu->get_popup()->get_item_index(MENU_VISIBILITY_SKELETON);
-			view_menu->get_popup()->toggle_item_statable(idx);
+			view_menu->get_popup()->toggle_item_multistate(idx);
 
 			// Change icon
 			const int state = view_menu->get_popup()->get_item_state(idx);
@@ -5077,8 +5077,7 @@ SpatialEditor::SpatialEditor(EditorNode *p_editor) {
 	p->add_shortcut(ED_SHORTCUT("spatial_editor/settings", TTR("Settings")), MENU_VIEW_CAMERA_SETTINGS);
 
 	p->add_separator();
-	p->add_statable_item(TTR("Skeleton Gizmo visibility"), 3, 1, MENU_VISIBILITY_SKELETON);
-	p->add_separator();
+	p->add_multistate_item(TTR("Skeleton Gizmo visibility"), 3, 1, MENU_VISIBILITY_SKELETON);
 
 	p->set_item_checked(p->get_item_index(MENU_VIEW_ORIGIN), true);
 	p->set_item_checked(p->get_item_index(MENU_VIEW_GRID), true);
diff --git a/editor/plugins/tile_map_editor_plugin.cpp b/editor/plugins/tile_map_editor_plugin.cpp
index 4d06342fe0..40abc4026a 100644
--- a/editor/plugins/tile_map_editor_plugin.cpp
+++ b/editor/plugins/tile_map_editor_plugin.cpp
@@ -751,6 +751,7 @@ bool TileMapEditor::forward_gui_input(const Ref<InputEvent> &p_event) {
 
 						if (id != TileMap::INVALID_CELL) {
 
+							_set_cell(over_tile, id, flip_h, flip_v, transpose);
 							undo_redo->add_do_method(node, "set", "tile_data", node->get("tile_data"));
 							undo_redo->commit_action();
 
diff --git a/editor/plugins/tile_set_editor_plugin.cpp b/editor/plugins/tile_set_editor_plugin.cpp
index b8c57fd959..ae726b69ef 100644
--- a/editor/plugins/tile_set_editor_plugin.cpp
+++ b/editor/plugins/tile_set_editor_plugin.cpp
@@ -244,7 +244,7 @@ TileSetEditor::TileSetEditor(EditorNode *p_editor) {
 	MenuButton *options = memnew(MenuButton);
 	panel->add_child(options);
 	options->set_position(Point2(1, 1));
-	options->set_text("Theme");
+	options->set_text(TTR("Tile Set"));
 	options->get_popup()->add_item(TTR("Add Item"), MENU_OPTION_ADD_ITEM);
 	options->get_popup()->add_item(TTR("Remove Item"), MENU_OPTION_REMOVE_ITEM);
 	options->get_popup()->add_separator();
diff --git a/editor/progress_dialog.cpp b/editor/progress_dialog.cpp
index 09f5375bb4..2c2e5a7c9b 100644
--- a/editor/progress_dialog.cpp
+++ b/editor/progress_dialog.cpp
@@ -163,7 +163,7 @@ void ProgressDialog::_popup() {
 	popup_centered(ms);
 }
 
-void ProgressDialog::add_task(const String &p_task, const String &p_label, int p_steps) {
+void ProgressDialog::add_task(const String &p_task, const String &p_label, int p_steps, bool p_can_cancel) {
 
 	ERR_FAIL_COND(tasks.has(p_task));
 	Task t;
@@ -180,17 +180,24 @@ void ProgressDialog::add_task(const String &p_task, const String &p_label, int p
 	main->add_child(t.vb);
 
 	tasks[p_task] = t;
+	if (p_can_cancel) {
+		cancel_hb->show();
+	} else {
+		cancel_hb->hide();
+	}
+	cancel_hb->raise();
+	cancelled = false;
 	_popup();
 }
 
-void ProgressDialog::task_step(const String &p_task, const String &p_state, int p_step, bool p_force_redraw) {
+bool ProgressDialog::task_step(const String &p_task, const String &p_state, int p_step, bool p_force_redraw) {
 
-	ERR_FAIL_COND(!tasks.has(p_task));
+	ERR_FAIL_COND_V(!tasks.has(p_task), cancelled);
 
 	if (!p_force_redraw) {
 		uint64_t tus = OS::get_singleton()->get_ticks_usec();
 		if (tus - last_progress_tick < 50000) //50ms
-			return;
+			return cancelled;
 	}
 
 	Task &t = tasks[p_task];
@@ -201,7 +208,11 @@ void ProgressDialog::task_step(const String &p_task, const String &p_state, int
 
 	t.state->set_text(p_state);
 	last_progress_tick = OS::get_singleton()->get_ticks_usec();
+	if (cancel_hb->is_visible()) {
+		OS::get_singleton()->force_process_input();
+	}
 	Main::iteration(); // this will not work on a lot of platforms, so it's only meant for the editor
+	return cancelled;
 }
 
 void ProgressDialog::end_task(const String &p_task) {
@@ -218,6 +229,14 @@ void ProgressDialog::end_task(const String &p_task) {
 		_popup();
 }
 
+void ProgressDialog::_cancel_pressed() {
+	cancelled = true; // Returned to callers by task_step(); reset to false by add_task().
+}
+
+void ProgressDialog::_bind_methods() {
+	ClassDB::bind_method("_cancel_pressed", &ProgressDialog::_cancel_pressed); // Name used by the cancel button's "pressed" connection.
+}
+
 ProgressDialog::ProgressDialog() {
 
 	main = memnew(VBoxContainer);
@@ -226,4 +245,13 @@ ProgressDialog::ProgressDialog() {
 	set_exclusive(true);
 	last_progress_tick = 0;
 	singleton = this;
+	cancel_hb = memnew(HBoxContainer);
+	main->add_child(cancel_hb);
+	cancel_hb->hide();
+	cancel = memnew(Button);
+	cancel_hb->add_spacer();
+	cancel_hb->add_child(cancel);
+	cancel->set_text(TTR("Cancel"));
+	cancel_hb->add_spacer();
+	cancel->connect("pressed", this, "_cancel_pressed");
 }
diff --git a/editor/progress_dialog.h b/editor/progress_dialog.h
index 8ac0907145..b13ea606bc 100644
--- a/editor/progress_dialog.h
+++ b/editor/progress_dialog.h
@@ -31,6 +31,7 @@
 #define PROGRESS_DIALOG_H
 
 #include "scene/gui/box_container.h"
+#include "scene/gui/button.h"
 #include "scene/gui/label.h"
 #include "scene/gui/popup.h"
 #include "scene/gui/progress_bar.h"
@@ -76,6 +77,8 @@ class ProgressDialog : public Popup {
 		ProgressBar *progress;
 		Label *state;
 	};
+	HBoxContainer *cancel_hb;
+	Button *cancel;
 
 	Map<String, Task> tasks;
 	VBoxContainer *main;
@@ -84,13 +87,17 @@ class ProgressDialog : public Popup {
 	static ProgressDialog *singleton;
 	void _popup();
 
+	void _cancel_pressed();
+	bool cancelled;
+
 protected:
 	void _notification(int p_what);
+	static void _bind_methods();
 
 public:
 	static ProgressDialog *get_singleton() { return singleton; }
-	void add_task(const String &p_task, const String &p_label, int p_steps);
-	void task_step(const String &p_task, const String &p_state, int p_step = -1, bool p_force_redraw = true);
+	void add_task(const String &p_task, const String &p_label, int p_steps, bool p_can_cancel = false);
+	bool task_step(const String &p_task, const String &p_state, int p_step = -1, bool p_force_redraw = true);
 	void end_task(const String &p_task);
 
 	ProgressDialog();
diff --git a/editor/project_settings_editor.cpp b/editor/project_settings_editor.cpp
index 76fd20ca12..1a7b7f3575 100644
--- a/editor/project_settings_editor.cpp
+++ b/editor/project_settings_editor.cpp
@@ -531,7 +531,7 @@ void ProjectSettingsEditor::_action_button_pressed(Object *p_obj, int p_column,
 			Variant old_val = ProjectSettings::get_singleton()->get(name);
 			int order = ProjectSettings::get_singleton()->get_order(name);
 
-			undo_redo->create_action(TTR("Add Input Action"));
+			undo_redo->create_action(TTR("Erase Input Action"));
 			undo_redo->add_do_method(ProjectSettings::get_singleton(), "clear", name);
 			undo_redo->add_undo_method(ProjectSettings::get_singleton(), "set", name, old_val);
 			undo_redo->add_undo_method(ProjectSettings::get_singleton(), "set_order", name, order);
@@ -852,7 +852,7 @@ void ProjectSettingsEditor::_action_add() {
 
 	Array va;
 	String name = "input/" + action_name->get_text();
-	undo_redo->create_action(TTR("Add Input Action Event"));
+	undo_redo->create_action(TTR("Add Input Action"));
 	undo_redo->add_do_method(ProjectSettings::get_singleton(), "set", name, va);
 	undo_redo->add_undo_method(ProjectSettings::get_singleton(), "clear", name);
 	undo_redo->add_do_method(this, "_update_actions");
diff --git a/editor/property_editor.cpp b/editor/property_editor.cpp
index b187a9ae9d..16ce364b92 100644
--- a/editor/property_editor.cpp
+++ b/editor/property_editor.cpp
@@ -551,6 +551,7 @@ bool CustomPropertyEditor::edit(Object *p_owner, const String &p_name, Variant::
 
 				text_edit->show();
 				text_edit->set_text(v);
+				text_edit->deselect();
 
 				int button_margin = get_constant("button_margin", "Dialogs");
 				int margin = get_constant("margin", "Dialogs");
@@ -900,10 +901,10 @@ bool CustomPropertyEditor::edit(Object *p_owner, const String &p_name, Variant::
 						int id = TYPE_BASE_ID + idx;
 						if (has_icon(t, "EditorIcons")) {
 
-							menu->add_icon_item(get_icon(t, "EditorIcons"), TTR("New") + " " + t, id);
+							menu->add_icon_item(get_icon(t, "EditorIcons"), vformat(TTR("New %s"), t), id);
 						} else {
 
-							menu->add_item(TTR("New") + " " + t, id);
+							menu->add_item(vformat(TTR("New %s"), t), id);
 						}
 
 						idx++;
@@ -2831,7 +2832,7 @@ void PropertyEditor::update_tree() {
 					class_descr_cache[type] = descr.word_wrap(80);
 				}
 
-				sep->set_tooltip(0, TTR("Class:") + " " + p.name + ":\n\n" + class_descr_cache[type]);
+				sep->set_tooltip(0, TTR("Class:") + " " + p.name + (class_descr_cache[type] == "" ? "" : "\n\n" + class_descr_cache[type]));
 			}
 			continue;
 
@@ -2963,7 +2964,7 @@ void PropertyEditor::update_tree() {
 				descr_cache[classname][propname] = descr;
 			}
 
-			item->set_tooltip(0, TTR("Property:") + " " + p.name + "\n\n" + descr);
+			item->set_tooltip(0, TTR("Property:") + " " + p.name + (descr == "" ? "" : "\n\n" + descr));
 		}
 
 		Dictionary d;
@@ -4532,6 +4533,7 @@ void SectionedPropertyEditor::update_category_list() {
 		for (int i = 0; i < sc; i++) {
 
 			TreeItem *parent = section_map[metasection];
+			parent->set_custom_bg_color(0, get_color("prop_subsection", "Editor"));
 
 			if (i > 0) {
 				metasection += "/" + sectionarr[i];
@@ -4585,7 +4587,7 @@ SectionedPropertyEditor::SectionedPropertyEditor() {
 	search_box = NULL;
 
 	VBoxContainer *left_vb = memnew(VBoxContainer);
-	left_vb->set_custom_minimum_size(Size2(160, 0) * EDSCALE);
+	left_vb->set_custom_minimum_size(Size2(170, 0) * EDSCALE);
 	add_child(left_vb);
 
 	sections = memnew(Tree);
diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp
index 4d86030e7d..4d5d467857 100644
--- a/editor/scene_tree_dock.cpp
+++ b/editor/scene_tree_dock.cpp
@@ -1373,77 +1373,81 @@ void SceneTreeDock::_create() {
 		}
 
 	} else if (current_option == TOOL_REPLACE) {
-		Node *n = scene_tree->get_selected();
-		ERR_FAIL_COND(!n);
+		List<Node *> selection = editor_selection->get_selected_node_list();
+		ERR_FAIL_COND(selection.size() <= 0);
+		for (List<Node *>::Element *E = selection.front(); E; E = E->next()) {
+			Node *n = E->get();
+			ERR_FAIL_COND(!n);
 
-		Object *c = create_dialog->instance_selected();
+			Object *c = create_dialog->instance_selected();
 
-		ERR_FAIL_COND(!c);
-		Node *newnode = Object::cast_to<Node>(c);
-		ERR_FAIL_COND(!newnode);
+			ERR_FAIL_COND(!c);
+			Node *newnode = Object::cast_to<Node>(c);
+			ERR_FAIL_COND(!newnode);
 
-		List<PropertyInfo> pinfo;
-		n->get_property_list(&pinfo);
+			List<PropertyInfo> pinfo;
+			n->get_property_list(&pinfo);
 
-		for (List<PropertyInfo>::Element *E = pinfo.front(); E; E = E->next()) {
-			if (!(E->get().usage & PROPERTY_USAGE_STORAGE))
-				continue;
-			if (E->get().name == "__meta__")
-				continue;
-			newnode->set(E->get().name, n->get(E->get().name));
-		}
+			for (List<PropertyInfo>::Element *E = pinfo.front(); E; E = E->next()) {
+				if (!(E->get().usage & PROPERTY_USAGE_STORAGE))
+					continue;
+				if (E->get().name == "__meta__")
+					continue;
+				newnode->set(E->get().name, n->get(E->get().name));
+			}
 
-		editor->push_item(NULL);
+			editor->push_item(NULL);
 
-		//reconnect signals
-		List<MethodInfo> sl;
+			//reconnect signals
+			List<MethodInfo> sl;
 
-		n->get_signal_list(&sl);
-		for (List<MethodInfo>::Element *E = sl.front(); E; E = E->next()) {
+			n->get_signal_list(&sl);
+			for (List<MethodInfo>::Element *E = sl.front(); E; E = E->next()) {
 
-			List<Object::Connection> cl;
-			n->get_signal_connection_list(E->get().name, &cl);
+				List<Object::Connection> cl;
+				n->get_signal_connection_list(E->get().name, &cl);
 
-			for (List<Object::Connection>::Element *F = cl.front(); F; F = F->next()) {
+				for (List<Object::Connection>::Element *F = cl.front(); F; F = F->next()) {
 
-				Object::Connection &c = F->get();
-				if (!(c.flags & Object::CONNECT_PERSIST))
-					continue;
-				newnode->connect(c.signal, c.target, c.method, varray(), Object::CONNECT_PERSIST);
+					Object::Connection &c = F->get();
+					if (!(c.flags & Object::CONNECT_PERSIST))
+						continue;
+					newnode->connect(c.signal, c.target, c.method, varray(), Object::CONNECT_PERSIST);
+				}
 			}
-		}
 
-		String newname = n->get_name();
+			String newname = n->get_name();
 
-		List<Node *> to_erase;
-		for (int i = 0; i < n->get_child_count(); i++) {
-			if (n->get_child(i)->get_owner() == NULL && n->is_owned_by_parent()) {
-				to_erase.push_back(n->get_child(i));
+			List<Node *> to_erase;
+			for (int i = 0; i < n->get_child_count(); i++) {
+				if (n->get_child(i)->get_owner() == NULL && n->is_owned_by_parent()) {
+					to_erase.push_back(n->get_child(i));
+				}
 			}
-		}
-		n->replace_by(newnode, true);
+			n->replace_by(newnode, true);
 
-		if (n == edited_scene) {
-			edited_scene = newnode;
-			editor->set_edited_scene(newnode);
-			newnode->set_editable_instances(n->get_editable_instances());
-		}
+			if (n == edited_scene) {
+				edited_scene = newnode;
+				editor->set_edited_scene(newnode);
+				newnode->set_editable_instances(n->get_editable_instances());
+			}
 
-		//small hack to make collisionshapes and other kind of nodes to work
-		for (int i = 0; i < newnode->get_child_count(); i++) {
-			Node *c = newnode->get_child(i);
-			c->call("set_transform", c->call("get_transform"));
-		}
-		editor_data->get_undo_redo().clear_history();
-		newnode->set_name(newname);
+			//small hack to make collisionshapes and other kind of nodes to work
+			for (int i = 0; i < newnode->get_child_count(); i++) {
+				Node *c = newnode->get_child(i);
+				c->call("set_transform", c->call("get_transform"));
+			}
+			editor_data->get_undo_redo().clear_history();
+			newnode->set_name(newname);
 
-		editor->push_item(newnode);
+			editor->push_item(newnode);
 
-		memdelete(n);
+			memdelete(n);
 
-		while (to_erase.front()) {
-			memdelete(to_erase.front()->get());
-			to_erase.pop_front();
+			while (to_erase.front()) {
+				memdelete(to_erase.front()->get());
+				to_erase.pop_front();
+			}
 		}
 	}
 }
@@ -1737,13 +1741,12 @@ void SceneTreeDock::_tree_rmb(const Vector2 &p_menu_pos) {
 		menu->add_icon_shortcut(get_icon("Add", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/add_child_node"), TOOL_NEW);
 		menu->add_icon_shortcut(get_icon("Instance", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/instance_scene"), TOOL_INSTANCE);
 		menu->add_separator();
-		menu->add_icon_shortcut(get_icon("Reload", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/change_node_type"), TOOL_REPLACE);
-		menu->add_separator();
 		menu->add_icon_shortcut(get_icon("ScriptCreate", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/attach_script"), TOOL_ATTACH_SCRIPT);
 		menu->add_icon_shortcut(get_icon("ScriptRemove", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/clear_script"), TOOL_CLEAR_SCRIPT);
 		menu->add_separator();
 	}
-
+	menu->add_icon_shortcut(get_icon("Reload", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/change_node_type"), TOOL_REPLACE);
+	menu->add_separator();
 	menu->add_icon_shortcut(get_icon("MoveUp", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/move_up"), TOOL_MOVE_UP);
 	menu->add_icon_shortcut(get_icon("MoveDown", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/move_down"), TOOL_MOVE_DOWN);
 	menu->add_icon_shortcut(get_icon("Duplicate", "EditorIcons"), ED_GET_SHORTCUT("scene_tree/duplicate"), TOOL_DUPLICATE);
diff --git a/editor/scene_tree_editor.cpp b/editor/scene_tree_editor.cpp
index 25924212fd..827e8d9ee4 100644
--- a/editor/scene_tree_editor.cpp
+++ b/editor/scene_tree_editor.cpp
@@ -256,9 +256,9 @@ bool SceneTreeEditor::_add_nodes(Node *p_node, TreeItem *p_parent) {
 
 			bool v = p_node->call("is_visible");
 			if (v)
-				item->add_button(0, get_icon("Visible", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
+				item->add_button(0, get_icon("GuiVisibilityVisible", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
 			else
-				item->add_button(0, get_icon("Hidden", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
+				item->add_button(0, get_icon("GuiVisibilityHidden", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
 
 			if (!p_node->is_connected("visibility_changed", this, "_node_visibility_changed"))
 				p_node->connect("visibility_changed", this, "_node_visibility_changed", varray(p_node));
@@ -272,9 +272,9 @@ bool SceneTreeEditor::_add_nodes(Node *p_node, TreeItem *p_parent) {
 
 			bool v = p_node->call("is_visible");
 			if (v)
-				item->add_button(0, get_icon("Visible", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
+				item->add_button(0, get_icon("GuiVisibilityVisible", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
 			else
-				item->add_button(0, get_icon("Hidden", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
+				item->add_button(0, get_icon("GuiVisibilityHidden", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
 
 			if (!p_node->is_connected("visibility_changed", this, "_node_visibility_changed"))
 				p_node->connect("visibility_changed", this, "_node_visibility_changed", varray(p_node));
@@ -337,9 +337,9 @@ void SceneTreeEditor::_node_visibility_changed(Node *p_node) {
 	}
 
 	if (visible)
-		item->set_button(0, idx, get_icon("Visible", "EditorIcons"));
+		item->set_button(0, idx, get_icon("GuiVisibilityVisible", "EditorIcons"));
 	else
-		item->set_button(0, idx, get_icon("Hidden", "EditorIcons"));
+		item->set_button(0, idx, get_icon("GuiVisibilityHidden", "EditorIcons"));
 
 	_update_visibility_color(p_node, item);
 }
@@ -774,9 +774,11 @@ Variant SceneTreeEditor::get_drag_data_fw(const Point2 &p_point, Control *p_from
 
 		Node *n = get_node(np);
 		if (n) {
-
-			selected.push_back(n);
-			icons.push_back(next->get_icon(0));
+			// Only allow selection if not part of an instanced scene.
+			if (!n->get_owner() || n->get_owner() == get_scene_node() || n->get_owner()->get_filename() == String()) {
+				selected.push_back(n);
+				icons.push_back(next->get_icon(0));
+			}
 		}
 		next = tree->get_next_selected(next);
 	}
diff --git a/editor/script_create_dialog.cpp b/editor/script_create_dialog.cpp
index 3cab14b0c4..97f442b0ec 100644
--- a/editor/script_create_dialog.cpp
+++ b/editor/script_create_dialog.cpp
@@ -331,6 +331,12 @@ void ScriptCreateDialog::_file_selected(const String &p_file) {
 	} else {
 		file_path->set_text(p);
 		_path_changed(p);
+
+		String filename = p.get_file().get_basename();
+		int select_start = p.find_last(filename);
+		file_path->select(select_start, select_start + filename.length());
+		file_path->set_cursor_position(select_start + filename.length());
+		file_path->grab_focus();
 	}
 }
 
@@ -425,6 +431,10 @@ void ScriptCreateDialog::_path_changed(const String &p_path) {
 	_update_dialog();
 }
 
+void ScriptCreateDialog::_path_entered(const String &p_path) {
+	ok_pressed(); // Connected to file_path's "text_entered" signal; p_path itself is not used.
+}
+
 void ScriptCreateDialog::_msg_script_valid(bool valid, const String &p_msg) {
 
 	error_label->set_text(TTR(p_msg));
@@ -459,7 +469,7 @@ void ScriptCreateDialog::_update_dialog() {
 			script_ok = false;
 		}
 	}
-	if (has_named_classes && (!is_class_name_valid)) {
+	if (has_named_classes && (is_new_script_created && !is_class_name_valid)) {
 		_msg_script_valid(false, TTR("Invalid class name"));
 		script_ok = false;
 	}
@@ -550,6 +560,7 @@ void ScriptCreateDialog::_bind_methods() {
 	ClassDB::bind_method("_browse_path", &ScriptCreateDialog::_browse_path);
 	ClassDB::bind_method("_file_selected", &ScriptCreateDialog::_file_selected);
 	ClassDB::bind_method("_path_changed", &ScriptCreateDialog::_path_changed);
+	ClassDB::bind_method("_path_entered", &ScriptCreateDialog::_path_entered);
 	ClassDB::bind_method("_template_changed", &ScriptCreateDialog::_template_changed);
 	ADD_SIGNAL(MethodInfo("script_created", PropertyInfo(Variant::OBJECT, "script", PROPERTY_HINT_RESOURCE_TYPE, "Script")));
 }
@@ -715,6 +726,7 @@ ScriptCreateDialog::ScriptCreateDialog() {
 	hb = memnew(HBoxContainer);
 	file_path = memnew(LineEdit);
 	file_path->connect("text_changed", this, "_path_changed");
+	file_path->connect("text_entered", this, "_path_entered");
 	file_path->set_h_size_flags(SIZE_EXPAND_FILL);
 	hb->add_child(file_path);
 	path_button = memnew(Button);
diff --git a/editor/script_create_dialog.h b/editor/script_create_dialog.h
index c7bbc82d47..1cff9871d8 100644
--- a/editor/script_create_dialog.h
+++ b/editor/script_create_dialog.h
@@ -73,6 +73,7 @@ class ScriptCreateDialog : public ConfirmationDialog {
 	Vector<String> template_list;
 
 	void _path_changed(const String &p_path = String());
+	void _path_entered(const String &p_path = String());
 	void _lang_changed(int l = 0);
 	void _built_in_pressed();
 	bool _validate(const String &p_string);
diff --git a/editor/spatial_editor_gizmos.cpp b/editor/spatial_editor_gizmos.cpp
index f785b3e198..f0e8d438fa 100644
--- a/editor/spatial_editor_gizmos.cpp
+++ b/editor/spatial_editor_gizmos.cpp
@@ -2753,7 +2753,122 @@ GIProbeGizmo::GIProbeGizmo(GIProbe *p_probe) {
 }
 
 ////////
+////////
+
+/// BakedIndirectLightGizmo: editor gizmo returned for BakedLightmap nodes.
+
+String BakedIndirectLightGizmo::get_handle_name(int p_idx) const {
+	// Handles 0, 1 and 2 are labelled after the X, Y and Z extents.
+	const char *const axis_labels[3] = { "Extents X", "Extents Y", "Extents Z" };
+
+	if (p_idx < 0 || p_idx > 2) {
+		// Any other index gets an empty name.
+		return "";
+	}
+	return axis_labels[p_idx];
+}
+Variant BakedIndirectLightGizmo::get_handle_value(int p_idx) const {
+	const Vector3 current_extents = baker->get_extents(); // same value whichever p_idx is given
+	return current_extents;
+}
+void BakedIndirectLightGizmo::set_handle(int p_idx, Camera *p_camera, const Point2 &p_point) {
+	// Inverse of the node's global transform; applied to the camera ray below.
+	const Transform to_local = baker->get_global_transform().affine_inverse();
+
+	Vector3 new_extents = baker->get_extents();
+
+	// Ray projected by the camera through p_point, turned into a long segment.
+	const Vector3 ray_origin = p_camera->project_ray_origin(p_point);
+	const Vector3 ray_normal = p_camera->project_ray_normal(p_point);
+	const Vector3 seg_begin = to_local.xform(ray_origin);
+	const Vector3 seg_end = to_local.xform(ray_origin + ray_normal * 16384);
+
+	// Selector for the axis controlled by the dragged handle (component p_idx set to 1).
+	Vector3 handle_axis;
+	handle_axis[p_idx] = 1.0;
+
+	// Closest point on the handle axis to the ray segment; its coordinate on that
+	// axis becomes the new extent, never smaller than 0.001.
+	Vector3 on_axis, on_ray;
+	Geometry::get_closest_points_between_segments(Vector3(), handle_axis * 16384, seg_begin, seg_end, on_axis, on_ray);
+	const float picked = on_axis[p_idx];
+
+	new_extents[p_idx] = picked < 0.001 ? 0.001 : picked;
+	baker->set_extents(new_extents);
+}
+
+void BakedIndirectLightGizmo::commit_handle(int p_idx, const Variant &p_restore, bool p_cancel) {
+	// p_restore carries the extents to go back to; p_idx is not consulted here.
+	const Vector3 previous_extents = p_restore;
+
+	if (!p_cancel) {
+		// Record an undoable action: do = the extents currently set, undo = the previous ones.
+		UndoRedo *undo_redo = SpatialEditor::get_singleton()->get_undo_redo();
+		undo_redo->create_action(TTR("Change Probe Extents"));
+		undo_redo->add_do_method(baker, "set_extents", baker->get_extents());
+		undo_redo->add_undo_method(baker, "set_extents", previous_extents);
+		undo_redo->commit_action();
+	} else {
+		baker->set_extents(previous_extents);
+	}
+}
 
+void BakedIndirectLightGizmo::redraw() {
+	// Rebuilds the gizmo geometry: box outline, extent handles, icon billboard and,
+	// while selected, a solid box drawn with alpha 0.1.
+	Color gizmo_color = EDITOR_GET("editors/3d_gizmos/gizmo_colors/baked_indirect_light");
+	Ref<Material> material = create_material("baked_indirect_light_material", gizmo_color);
+	Ref<Material> icon = create_icon_material("baked_indirect_light_icon", SpatialEditor::get_singleton()->get_icon("GizmoBakedLightmap", "EditorIcons"));
+
+	clear();
+
+	Vector<Vector3> lines;
+	Vector3 extents = baker->get_extents();
+
+	// Box with position -extents and size 2 * extents.
+	AABB aabb = AABB(-extents, extents * 2);
+
+	// The twelve box edges are used both for the drawn lines and for the
+	// collision segments of the gizmo.
+	for (int i = 0; i < 12; i++) {
+		Vector3 a, b;
+		aabb.get_edge(i, a, b);
+		lines.push_back(a);
+		lines.push_back(b);
+	}
+
+	add_lines(lines, material);
+	add_collision_segments(lines);
+
+	// One handle per axis, placed on that axis at the box's far side
+	// (position + size).
+	Vector<Vector3> handles;
+
+	for (int i = 0; i < 3; i++) {
+
+		Vector3 ax;
+		ax[i] = aabb.position[i] + aabb.size[i];
+		handles.push_back(ax);
+	}
+
+	if (is_selected()) {
+
+		// Reuse the gizmo color at alpha 0.1 for the solid box.
+		gizmo_color.a = 0.1;
+		Ref<Material> solid_material = create_material("baked_indirect_light_solid_material", gizmo_color);
+		add_solid_box(solid_material, aabb.get_size());
+	}
+
+	add_unscaled_billboard(icon, 0.05);
+	add_handles(handles);
+}
+BakedIndirectLightGizmo::BakedIndirectLightGizmo(BakedLightmap *p_baker) :
+		baker(p_baker) {
+	// The same node is also handed to set_spatial_node().
+	set_spatial_node(p_baker);
+}
+
+////////
 void NavigationMeshSpatialGizmo::redraw() {
 
 	Ref<Material> edge_material = create_material("navigation_material", EDITOR_GET("editors/3d_gizmos/gizmo_colors/navigation_edge"));
@@ -3409,6 +3524,11 @@ Ref<SpatialEditorGizmo> SpatialEditorGizmos::get_gizmo(Spatial *p_spatial) {
 		Ref<GIProbeGizmo> misg = memnew(GIProbeGizmo(Object::cast_to<GIProbe>(p_spatial)));
 		return misg;
 	}
+	if (Object::cast_to<BakedLightmap>(p_spatial)) {
+
+		Ref<BakedIndirectLightGizmo> misg = memnew(BakedIndirectLightGizmo(Object::cast_to<BakedLightmap>(p_spatial)));
+		return misg;
+	}
 
 	if (Object::cast_to<VehicleWheel>(p_spatial)) {
 
@@ -3495,6 +3615,7 @@ SpatialEditorGizmos::SpatialEditorGizmos() {
 	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/particles", Color(0.8, 0.7, 0.4));
 	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/reflection_probe", Color(0.6, 1, 0.5));
 	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/gi_probe", Color(0.5, 1, 0.6));
+	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/baked_indirect_light", Color(0.5, 0.6, 1));
 	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/shape", Color(0.5, 0.7, 1));
 	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/joint", Color(0.5, 0.8, 1));
 	EDITOR_DEF("editors/3d_gizmos/gizmo_colors/navigation_edge", Color(0.5, 1, 1));
diff --git a/editor/spatial_editor_gizmos.h b/editor/spatial_editor_gizmos.h
index 751bad2b13..ea8a33d2c6 100644
--- a/editor/spatial_editor_gizmos.h
+++ b/editor/spatial_editor_gizmos.h
@@ -32,6 +32,7 @@
 
 #include "editor/plugins/spatial_editor_plugin.h"
 #include "scene/3d/audio_stream_player_3d.h"
+#include "scene/3d/baked_lightmap.h"
 #include "scene/3d/camera.h"
 #include "scene/3d/collision_polygon.h"
 #include "scene/3d/collision_shape.h"
@@ -288,6 +289,22 @@ public:
 	GIProbeGizmo(GIProbe *p_probe = NULL);
 };
 
+class BakedIndirectLightGizmo : public EditorSpatialGizmo {
+
+	GDCLASS(BakedIndirectLightGizmo, EditorSpatialGizmo);
+
+	BakedLightmap *baker; // Node whose extents the handle callbacks read and modify; set by the constructor.
+
+public:
+	virtual String get_handle_name(int p_idx) const; // "Extents X/Y/Z" for indices 0..2, otherwise empty.
+	virtual Variant get_handle_value(int p_idx) const; // Current extents of the node.
+	virtual void set_handle(int p_idx, Camera *p_camera, const Point2 &p_point); // Updates one extent component from the camera ray through p_point.
+	virtual void commit_handle(int p_idx, const Variant &p_restore, bool p_cancel = false); // Restores p_restore on cancel, otherwise records an undo action.
+
+	void redraw(); // Rebuilds the gizmo's lines, handles and icon billboard.
+	BakedIndirectLightGizmo(BakedLightmap *p_baker = NULL);
+};
+
 class CollisionShapeSpatialGizmo : public EditorSpatialGizmo {
 
 	GDCLASS(CollisionShapeSpatialGizmo, EditorSpatialGizmo);
diff --git a/main/input_default.cpp b/main/input_default.cpp
index 8c91a1a5de..f637e77d56 100644
--- a/main/input_default.cpp
+++ b/main/input_default.cpp
@@ -635,7 +635,7 @@ static const char *s_ControllerMappings[] = {
 	"d81d0b00000000000000504944564944,BUFFALO BSGP1601 Series ,x:b4,a:b5,b:b3,y:b2,back:b12,start:b13,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b8,lefttrigger:b6,rightshoulder:b9,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,",
 	"d81d0f00000000000000504944564944,iBUFFALO BSGP1204 Series,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b6,lefttrigger:b4,rightshoulder:b7,righttrigger:b5,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,",
 	"d81d1000000000000000504944564944,iBUFFALO BSGP1204P Series,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b6,lefttrigger:b4,rightshoulder:b7,righttrigger:b5,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,",
-	"ff113133000000000000504944564944,Gembird JPD-DualForce,a:b2,b:b3,x:b0,y:b1,start:b9,back:b8,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a4,lefttrigger:b6,righttrigger:b7,leftstick:b10,rightstick:b11,",
+	"ff113133000000000000504944564944,SVEN X-PAD,a:b2,b:b3,y:b1,x:b0,start:b5,back:b4,leftshoulder:b6,rightshoulder:b7,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a4,lefttrigger:b8,righttrigger:b9,",
 	"ffff0000000000000000504944564944,GameStop Gamepad,a:b0,b:b1,back:b8,dpdown:h0.4,dpleft:h0.8,dpright:h0.2,dpup:h0.1,guide:,leftshoulder:b4,leftstick:b10,lefttrigger:b6,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b11,righttrigger:b7,rightx:a2,righty:a3,start:b9,x:b2,y:b3,",
 	"__XINPUT_DEVICE__,XInput Gamepad,a:b12,b:b13,x:b14,y:b15,start:b4,back:b5,leftstick:b6,rightstick:b7,leftshoulder:b8,rightshoulder:b9,dpup:b0,dpdown:b1,dpleft:b2,dpright:b3,leftx:a0,lefty:a1,rightx:a2,righty:a3,lefttrigger:a4,righttrigger:a5,",
 #endif
@@ -684,7 +684,7 @@ static const char *s_ControllerMappings[] = {
 	"0300000000f00000f100000000010000,RetroUSB.com Super RetroPort,a:b1,b:b5,x:b0,y:b4,back:b2,start:b3,leftshoulder:b6,rightshoulder:b7,leftx:a0,lefty:a1,",
 	"030000000d0f00000d00000000010000,hori,a:b0,b:b6,y:b2,x:b1,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,start:b9,guide:b10,back:b8,leftshoulder:b3,rightshoulder:b7,leftx:b4,lefty:b5,",
 	"030000000d0f00001000000011010000,HORI CO. LTD. FIGHTING STICK 3,x:b0,a:b1,b:b2,y:b3,back:b8,guide:b12,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7",
-	"030000000d0f00002200000011010000,HORI CO.,LTD. REAL ARCADE Pro.V3,x:b0,a:b1,b:b2,y:b3,back:b8,guide:b12,start:b9,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,",
+	"030000000d0f00002200000011010000,HORI CO. LTD. REAL ARCADE Pro.V3,x:b0,a:b1,b:b2,y:b3,back:b8,guide:b12,start:b9,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,",
 	"030000000d0f00004d00000011010000,HORI Gem Pad 3,x:b0,a:b1,b:b2,y:b3,back:b8,guide:b12,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,",
 	"03000000100800000100000010010000,Twin USB PS2 Adapter,a:b2,b:b1,y:b0,x:b3,start:b9,guide:,back:b8,leftstick:b10,rightstick:b11,leftshoulder:b6,rightshoulder:b7,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a3,righty:a2,lefttrigger:b4,righttrigger:b5,",
 	"030000001008000001e5000010010000,NEXT Classic USB Game Controller,a:b0,b:b1,back:b8,start:b9,rightx:a2,righty:a3,leftx:a0,lefty:a1,",
@@ -698,8 +698,8 @@ static const char *s_ControllerMappings[] = {
 	"03000000451300000830000010010000,NYKO CORE,a:b1,b:b2,y:b3,x:b0,start:b9,guide:b12,back:b8,leftstick:b10,rightstick:b11,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a5,lefttrigger:b6,righttrigger:b7,",
 	"030000004c0500006802000011010000,PS3 Controller,a:b14,b:b13,back:b0,dpdown:b6,dpleft:b7,dpright:b5,dpup:b4,guide:b16,leftshoulder:b10,leftstick:b1,lefttrigger:b8,leftx:a0,lefty:a1,rightshoulder:b11,rightstick:b2,righttrigger:b9,rightx:a2,righty:a3,start:b3,x:b15,y:b12,",
 	"030000004c050000a00b000011010000,Sony DualShock 4 Wireless Adaptor,a:b1,b:b2,y:b3,x:b0,start:b9,guide:b12,back:b13,leftstick:b10,rightstick:b11,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a5,lefttrigger:a3,righttrigger:a4,",
-	"030000004c050000c405000011010000,Sony DualShock 4,a:b1,b:b2,y:b3,x:b0,start:b9,guide:b12,back:b8,leftstick:b10,rightstick:b11,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a5,lefttrigger:a3,righttrigger:a4,",
-	"030000004c050000c405000011810000,Sony Computer Entertainment Wireless Controller,leftx:a0,lefty:a1,dpdown:h0.4,rightstick:h0.1,rightshoulder:b5,rightx:a3,start:b9,righty:a4,dpleft:h0.8,lefttrigger:a2,x:b3,dpup:h0.1,back:b8,leftstick:b11,leftshoulder:b4,y:b2,a:b0,dpright:h0.2,righttrigger:a5,b:b1,",
+	"030000004c050000c405000011010000,Sony DualShock 4,a:b1,b:b2,y:b3,x:b0,start:b9,guide:b12,back:b8,leftstick:b10,rightstick:b11,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a5,lefttrigger:b6,righttrigger:b7,",
+	"030000004c050000c405000011810000,Sony DualShock 4,a:b0,b:b1,y:b2,x:b3,start:b9,guide:b10,back:b8,leftstick:b11,rightstick:b12,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a3,righty:a4,lefttrigger:a2,righttrigger:a5,",
 	"030000004c050000cc09000011010000,Sony DualShock 4 V2,a:b1,b:b2,y:b3,x:b0,start:b9,guide:b12,back:b13,leftstick:b10,rightstick:b11,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a5,lefttrigger:a3,righttrigger:a4,",
 	"030000004c050000cc09000011810000,Sony DualShock 4 (CUH-ZCT2U) (USB),a:b0,b:b1,y:b2,x:b3,leftshoulder:b4,rightshoulder:b5,back:b8,start:b9,guide:b10,leftstick:b11,rightstick:b12,leftx:a0,lefty:a1,lefttrigger:a2,rightx:a3,righty:a4,righttrigger:a5,dpup:h0.1,dpright:h0.2,dpdown:h0.4,dpleft:h0.8,",
 	"030000004f04000000b3000010010000,Thrustmaster Firestorm Dual Power,a:b0,b:b2,y:b3,x:b1,start:b10,guide:b8,back:b9,leftstick:b11,rightstick:b12,leftshoulder:b4,rightshoulder:b6,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a3,lefttrigger:b5,righttrigger:b7,",
@@ -718,7 +718,7 @@ static const char *s_ControllerMappings[] = {
 	"030000005e0400008e02000020200000,SpeedLink XEOX Pro Analog Gamepad pad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"030000005e0400008e02000062230000,Microsoft X-Box 360 pad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"030000005e0400008e02000073050000,Speedlink TORID Wireless Gamepad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
-	"030000005e0400009102000007010000,X360 Wireless Controller,a:b0,b:b1,back:b6,dpdown:b14,dpleft:b11,dpright:b12,dpup:b13,guide:b8,leftshoulder:b4,leftstick:b9,lefttrigger:a2,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b10,righttrigger:a5,rightx:a3,righty:a4,start:b7,x:b2,y:b3,",
+	"030000005e0400009102000007010000,X360 Wireless Controller,a:b0,b:b1,y:b3,x:b2,start:b7,guide:b8,back:b6,leftstick:b9,rightstick:b10,leftshoulder:b4,rightshoulder:b5,dpup:b13,dpleft:b11,dpdown:b14,dpright:b12,leftx:a0,lefty:a1,rightx:a3,righty:a4,lefttrigger:a2,righttrigger:a5,",
 	"030000005e040000a102000000010000,X360 Wireless Controller,a:b0,b:b1,back:b6,dpdown:b14,dpleft:b11,dpright:b12,dpup:b13,guide:b8,leftshoulder:b4,leftstick:b9,lefttrigger:a2,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b10,righttrigger:a5,rightx:a3,righty:a4,start:b7,x:b2,y:b3,",
 	"030000005e040000d102000001010000,Microsoft X-Box One pad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"030000005e040000dd02000003020000,Microsoft X-Box One pad v2,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
@@ -734,7 +734,7 @@ static const char *s_ControllerMappings[] = {
 	"030000006d0400001fc2000005030000,Logitech F710 Gamepad (XInput),a:b0,b:b1,back:b6,dpdown:h0.4,dpleft:h0.8,dpright:h0.2,dpup:h0.1,guide:b8,leftshoulder:b4,leftstick:b9,lefttrigger:a2,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b10,righttrigger:a5,rightx:a3,righty:a4,start:b7,x:b2,y:b3,",
 	"030000006e0500000320000010010000,JC-U3613M - DirectInput Mode,x:b0,a:b2,b:b3,y:b1,back:b10,guide:b12,start:b11,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,leftstick:b8,rightstick:b9,leftx:a0,lefty:a1,rightx:a2,righty:a3,",
 	"030000006f0e00000103000000020000,Logic3 Controller,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
-	"030000006f0e00001304000000010000,Generic X-Box pad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
+	"030000006f0e00001304000000010000,Generic X-Box pad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:a0,rightstick:a3,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"030000006f0e00001e01000011010000,Rock Candy Gamepad for PS3,a:b1,b:b2,x:b0,y:b3,back:b8,start:b9,guide:b12,leftshoulder:b4,rightshoulder:b5,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,lefttrigger:b6,righttrigger:b7,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,",
 	"030000006f0e00001f01000000010000,Generic X-Box pad,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"030000006f0e00002801000011010000,PDP Rock Candy Wireless Controller for PS3,leftx:a0,lefty:a1,dpdown:h0.4,rightstick:b11,rightshoulder:b5,rightx:a2,start:b9,righty:a3,dpleft:h0.8,lefttrigger:b6,x:b0,dpup:h0.1,back:b8,leftstick:b10,leftshoulder:b4,y:b3,a:b1,dpright:h0.2,righttrigger:b7,b:b2,",
@@ -742,13 +742,13 @@ static const char *s_ControllerMappings[] = {
 	"030000006f0e00003901000020060000,Afterglow Wired Controller for Xbox One,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"030000006f0e00004601000001010000,Rock Candy Wired Controller for Xbox One,a:b0,b:b1,x:b2,y:b3,leftshoulder:b4,rightshoulder:b5,back:b6,start:b7,guide:b8,leftstick:b9,rightstick:b10,lefttrigger:a2,righttrigger:a5,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
 	"03000000780000000600000010010000,Microntek USB Joystick,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,leftshoulder:b6,lefttrigger:b4,rightshoulder:b7,righttrigger:b5,leftx:a0,lefty:a1,",
-	"03000000790000000600000010010000,DragonRise Inc. Generic USB Joystick,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
-	"03000000790000001100000010010000,Retrolink Classic Controller,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,leftshoulder:b4,rightshoulder:b5,leftx:a0,lefty:a1,",
+	"03000000790000000600000010010000,DragonRise Inc.   Generic   USB  Joystick  ,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
+	"03000000790000001100000010010000,RetroLink Saturn Classic Controller,x:b3,a:b0,b:b1,y:b4,back:b5,guide:b2,start:b8,leftshoulder:b6,rightshoulder:b7,leftx:a0,lefty:a1,",
 	"03000000830500006020000010010000,iBuffalo USB 2-axis 8-button Gamepad,a:b1,b:b0,x:b3,y:b2,back:b6,start:b7,leftshoulder:b4,rightshoulder:b5,leftx:a0,lefty:a1,",
 	"030000008916000000fd000024010000,Razer Onza Tournament,a:b0,b:b1,y:b3,x:b2,start:b7,guide:b8,back:b6,leftstick:b9,rightstick:b10,leftshoulder:b4,rightshoulder:b5,dpup:b13,dpleft:b11,dpdown:b14,dpright:b12,leftx:a0,lefty:a1,rightx:a3,righty:a4,lefttrigger:a2,righttrigger:a5,",
 	"030000008916000001fd000024010000,Razer Onza Classic Edition,x:b2,a:b0,b:b1,y:b3,back:b6,guide:b8,start:b7,dpleft:b11,dpdown:b14,dpright:b12,dpup:b13,leftshoulder:b4,lefttrigger:a2,rightshoulder:b5,righttrigger:a5,leftstick:b9,rightstick:b10,leftx:a0,lefty:a1,rightx:a3,righty:a4,",
-	"030000008f0e00000300000010010000,GreenAsia Inc. USB Joystick,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b6,lefttrigger:b4,rightshoulder:b7,righttrigger:b5,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a3,righty:a2,",
-	"030000008f0e00001200000010010000,GreenAsia Inc. USB Joystick,x:b2,a:b0,b:b1,y:b3,back:b8,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b5,rightshoulder:b6,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a3,righty:a2,",
+	"030000008f0e00000300000010010000,GreenAsia Inc.    USB Joystick     ,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b6,lefttrigger:b4,rightshoulder:b7,righttrigger:b5,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a3,righty:a2,",
+	"030000008f0e00001200000010010000,GreenAsia Inc.      USB  Joystick  ,x:b2,a:b0,b:b1,y:b3,back:b8,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b5,rightshoulder:b6,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a3,righty:a2,",
 	"030000009b2800000300000001010000,raphnet.net 4nes4snes v1.5,x:b1,a:b0,b:b4,y:b5,back:b2,start:b3,leftshoulder:b6,rightshoulder:b7,leftx:a0,lefty:a1,",
 	"03000000a30600000901000000010000,Saitek P880,a:b2,b:b3,y:b1,x:b0,leftstick:b8,rightstick:b9,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a3,righty:a2,lefttrigger:b6,righttrigger:b7,",
 	"03000000a30600000c04000011010000,Saitek P2900 Wireless Pad,a:b1,b:b2,y:b3,x:b0,start:b12,guide:b9,back:b8,leftstick:b10,rightstick:b11,leftshoulder:b6,rightshoulder:b7,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a3,righty:a2,lefttrigger:b4,righttrigger:b5,",
@@ -761,7 +761,7 @@ static const char *s_ControllerMappings[] = {
 	"03000000bd12000015d0000010010000,Tomee SNES USB Controller,x:b3,a:b2,b:b1,y:b0,back:b8,start:b9,leftshoulder:b4,rightshoulder:b5,leftx:a0,lefty:a1,",
 	"03000000c01600008704000011010000,Serial/Keyboard/Mouse/Joystick,a:b12,b:b10,x:b13,y:b11,back:b4,start:b5,leftstick:b14,rightstick:b15,leftshoulder:b9,rightshoulder:b8,dpup:b0,dpdown:b2,dpleft:b3,dpright:b1,leftx:a1,lefty:a0,rightx:a2,righty:a3,lefttrigger:b6,righttrigger:b7,",
 	"03000000c0160000e105000001010000,Xin-Mo Xin-Mo Dual Arcade,y:b0,x:b1,b:b3,a:b4,leftshoulder:b2,rightshoulder:b5,back:b6,start:b7,guide:b9,dpleft:b13,dpdown:b12,dpright:b14,dpup:b11,leftx:a0,lefty:a1,",
-	"03000000c9110000f055000011010000,HJC Game GAMEPAD,x:b2,a:b0,b:b1,y:b3,back:b4,back:b8,start:b9,dpleft:h0.8,dpdown:h0.0,dpdown:h0.4,dpright:h0.0,dpright:h0.2,dpup:h0.0,dpup:h0.1,leftshoulder:h0.0,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,platform:Linux,",
+	"03000000c9110000f055000011010000,HJC Game GAMEPAD,leftx:a0,lefty:a1,dpdown:h0.4,rightstick:b11,rightshoulder:b5,rightx:a2,start:b9,righty:a3,dpleft:h0.8,lefttrigger:b6,x:b2,dpup:h0.1,back:b8,leftstick:b10,leftshoulder:b4,y:b3,a:b0,dpright:h0.2,righttrigger:b7,b:b1,",
 	"03000000d814000007cd000011010000,Toodles 2008 Chimp PC/PS3,a:b0,b:b1,y:b2,x:b3,start:b9,back:b8,leftshoulder:b4,rightshoulder:b5,leftx:a0,lefty:a1,lefttrigger:b6,righttrigger:b7,",
 	"03000000d81400000862000011010000,HitBox (PS3/PC) Analog Mode,a:b1,b:b2,y:b3,x:b0,start:b12,guide:b9,back:b8,leftshoulder:b4,rightshoulder:b5,lefttrigger:b6,righttrigger:b7,leftx:a0,lefty:a1,",
 	"03000000de280000ff11000001000000,Valve Streaming Gamepad,a:b0,b:b1,back:b6,dpdown:h0.4,dpleft:h0.8,dpright:h0.2,dpup:h0.1,guide:b8,leftshoulder:b4,leftstick:b9,lefttrigger:a2,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b10,righttrigger:a5,rightx:a3,righty:a4,start:b7,x:b2,y:b3,",
@@ -778,7 +778,7 @@ static const char *s_ControllerMappings[] = {
 	"05000000380700006652000025010000,Mad Catz C.T.R.L.R ,x:b0,a:b1,b:b2,y:b3,back:b8,guide:b12,start:b9,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,dpup:h0.1,leftshoulder:b4,lefttrigger:b6,rightshoulder:b5,righttrigger:b7,leftstick:b10,rightstick:b11,leftx:a0,lefty:a1,rightx:a2,righty:a3,",
 	"0500000047532047616d657061640000,GameStop Gamepad,a:b0,b:b1,back:b8,dpdown:h0.4,dpleft:h0.8,dpright:h0.2,dpup:h0.1,guide:,leftshoulder:b4,leftstick:b10,lefttrigger:b6,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b11,righttrigger:b7,rightx:a2,righty:a3,start:b9,x:b2,y:b3,",
 	"050000004c0500006802000000010000,PS3 Controller (Bluetooth),a:b14,b:b13,y:b12,x:b15,start:b3,guide:b16,back:b0,leftstick:b1,rightstick:b2,leftshoulder:b10,rightshoulder:b11,dpup:b4,dpleft:b7,dpdown:b6,dpright:b5,leftx:a0,lefty:a1,rightx:a2,righty:a3,lefttrigger:b8,righttrigger:b9,",
-	"050000004c050000c405000000010000,PS4 Controller (Bluetooth),a:b1,b:b2,back:b8,dpdown:h0.4,dpleft:h0.8,dpright:h0.2,dpup:h0.1,guide:b12,leftshoulder:b4,leftstick:b10,lefttrigger:a3,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b11,righttrigger:a4,rightx:a2,righty:a5,start:b9,x:b0,y:b3,",
+	"050000004c050000c405000000010000,Sony DualShock 4 BT,a:b1,b:b2,back:b13,dpdown:h0.4,dpleft:h0.8,dpright:h0.2,dpup:h0.1,guide:b12,leftshoulder:b4,leftstick:b10,lefttrigger:a3,leftx:a0,lefty:a1,rightshoulder:b5,rightstick:b11,righttrigger:a4,rightx:a2,righty:a5,start:b9,x:b0,y:b3,",
 	"050000004c050000cc09000000010000,Sony DualShock 4 V2 BT,a:b1,b:b2,y:b3,x:b0,start:b9,guide:b12,back:b13,leftstick:b10,rightstick:b11,leftshoulder:b4,rightshoulder:b5,dpup:h0.1,dpleft:h0.8,dpdown:h0.4,dpright:h0.2,leftx:a0,lefty:a1,rightx:a2,righty:a5,lefttrigger:a3,righttrigger:a4,",
 	"050000004c050000cc09000000810000,Sony DualShock 4 (CUH-ZCT2U) (Bluetooth),a:b0,b:b1,y:b2,x:b3,leftshoulder:b4,rightshoulder:b5,back:b8,start:b9,guide:b10,leftstick:b11,rightstick:b12,leftx:a0,lefty:a1,lefttrigger:a2,rightx:a3,righty:a4,righttrigger:a5,dpup:h0.1,dpright:h0.2,dpdown:h0.4,dpleft:h0.8,",
 	"05000000504c415953544154494f4e00,PS3 Controller (Bluetooth),a:b14,b:b13,y:b12,x:b15,start:b3,guide:b16,back:b0,leftstick:b1,rightstick:b2,leftshoulder:b10,rightshoulder:b11,dpup:b4,dpleft:b7,dpdown:b6,dpright:b5,leftx:a0,lefty:a1,rightx:a2,righty:a3,lefttrigger:b8,righttrigger:b9,",
diff --git a/main/main.cpp b/main/main.cpp
index 1328807121..c6e20f6d3b 100644
--- a/main/main.cpp
+++ b/main/main.cpp
@@ -1136,6 +1136,8 @@ Error Main::setup2(Thread::ID p_main_tid_override) {
 	translation_server->load_translations();
 	ResourceLoader::load_translation_remaps(); //load remaps for resources
 
+	ResourceLoader::load_path_remaps();
+
 	audio_server->load_default_bus_layout();
 
 	if (use_debug_profiler && script_debugger) {
@@ -1816,6 +1818,9 @@ void Main::cleanup() {
 	OS::get_singleton()->_execpath = "";
 	OS::get_singleton()->_local_clipboard = "";
 
+	ResourceLoader::clear_translation_remaps();
+	ResourceLoader::clear_path_remaps();
+
 	ScriptServer::finish_languages();
 
 #ifdef TOOLS_ENABLED
diff --git a/modules/bullet/SCsub b/modules/bullet/SCsub
index 7a37cca130..0967bca3f2 100644
--- a/modules/bullet/SCsub
+++ b/modules/bullet/SCsub
@@ -1,9 +1,13 @@
 #!/usr/bin/env python
 
 Import('env')
+Import('env_modules')
 
 # build only version 2
 # Bullet 2.87
+
+env_bullet = env_modules.Clone()
+
 bullet_src__2_x = [
         # BulletCollision
           "BulletCollision/BroadphaseCollision/btAxisSweep3.cpp"
@@ -181,11 +185,11 @@ thirdparty_src = thirdparty_dir + "src/"
 bullet_sources = [thirdparty_src + file for file in bullet_src__2_x]
 
 # include headers
-env.Append(CPPPATH=[thirdparty_src])
+env_bullet.Append(CPPPATH=[thirdparty_src])
 
-env.add_source_files(env.modules_sources, bullet_sources)
+env_bullet.add_source_files(env.modules_sources, bullet_sources)
 
 # Godot source files
-env.add_source_files(env.modules_sources, "*.cpp")
+env_bullet.add_source_files(env.modules_sources, "*.cpp")
 
 Export('env')
diff --git a/modules/enet/networked_multiplayer_enet.cpp b/modules/enet/networked_multiplayer_enet.cpp
index ce485956b4..396bebf0ea 100644
--- a/modules/enet/networked_multiplayer_enet.cpp
+++ b/modules/enet/networked_multiplayer_enet.cpp
@@ -386,7 +386,7 @@ int NetworkedMultiplayerENet::get_available_packet_count() const {
 
 	return incoming_packets.size();
 }
-Error NetworkedMultiplayerENet::get_packet(const uint8_t **r_buffer, int &r_buffer_size) const {
+Error NetworkedMultiplayerENet::get_packet(const uint8_t **r_buffer, int &r_buffer_size) {
 
 	ERR_FAIL_COND_V(incoming_packets.size() == 0, ERR_UNAVAILABLE);
 
@@ -480,7 +480,7 @@ int NetworkedMultiplayerENet::get_max_packet_size() const {
 	return 1 << 24; //anything is good
 }
 
-void NetworkedMultiplayerENet::_pop_current_packet() const {
+void NetworkedMultiplayerENet::_pop_current_packet() {
 
 	if (current_packet.packet) {
 		enet_packet_destroy(current_packet.packet);
diff --git a/modules/enet/networked_multiplayer_enet.h b/modules/enet/networked_multiplayer_enet.h
index 81d517147d..d7bc5c7849 100644
--- a/modules/enet/networked_multiplayer_enet.h
+++ b/modules/enet/networked_multiplayer_enet.h
@@ -86,12 +86,12 @@ private:
 
 	CompressionMode compression_mode;
 
-	mutable List<Packet> incoming_packets;
+	List<Packet> incoming_packets;
 
-	mutable Packet current_packet;
+	Packet current_packet;
 
 	uint32_t _gen_unique_id() const;
-	void _pop_current_packet() const;
+	void _pop_current_packet();
 
 	Vector<uint8_t> src_compressor_mem;
 	Vector<uint8_t> dst_compressor_mem;
@@ -123,7 +123,7 @@ public:
 	virtual bool is_server() const;
 
 	virtual int get_available_packet_count() const;
-	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size) const; ///< buffer is GONE after next get_packet
+	virtual Error get_packet(const uint8_t **r_buffer, int &r_buffer_size); ///< buffer is GONE after next get_packet
 	virtual Error put_packet(const uint8_t *p_buffer, int p_buffer_size);
 
 	virtual int get_max_packet_size() const;
diff --git a/modules/etc/SCsub b/modules/etc/SCsub
index 9c3e703f11..31d8f00ef3 100644
--- a/modules/etc/SCsub
+++ b/modules/etc/SCsub
@@ -34,7 +34,8 @@ env_etc.Append(CPPPATH=[thirdparty_dir])
 env_etc.add_source_files(env.modules_sources, "*.cpp")
 
 # upstream uses c++11
-env_etc.Append(CCFLAGS="-std=gnu++11")
+if (not env_etc.msvc):
+	env_etc.Append(CCFLAGS="-std=c++11")
 # -ffast-math seems to be incompatible with ec2comp on recent versions of
 # GCC and Clang
 if '-ffast-math' in env_etc['CCFLAGS']:
diff --git a/modules/gdnative/gdnative/gdnative.cpp b/modules/gdnative/gdnative/gdnative.cpp
index 92a88e354b..8ff67b10b1 100644
--- a/modules/gdnative/gdnative/gdnative.cpp
+++ b/modules/gdnative/gdnative/gdnative.cpp
@@ -52,10 +52,6 @@ godot_object GDAPI *godot_global_get_singleton(char *p_name) {
 	return (godot_object *)Engine::get_singleton()->get_singleton_object(String(p_name));
 } // result shouldn't be freed
 
-void GDAPI *godot_get_stack_bottom() {
-	return OS::get_singleton()->get_stack_bottom();
-}
-
 // MethodBind API
 
 godot_method_bind GDAPI *godot_method_bind_get_method(const char *p_classname, const char *p_methodname) {
diff --git a/modules/gdnative/gdnative_api.json b/modules/gdnative/gdnative_api.json
index 31f3b0b77b..06c6e9f410 100644
--- a/modules/gdnative/gdnative_api.json
+++ b/modules/gdnative/gdnative_api.json
@@ -5569,6 +5569,12 @@
         ]
       },
       {
+        "name": "godot_get_global_constants",
+        "return_type": "godot_dictionary",
+        "arguments": [
+        ]
+      },
+      {
         "name": "godot_register_native_call_type",
         "return_type": "void",
         "arguments": [
diff --git a/modules/gdnative/include/gdnative/gdnative.h b/modules/gdnative/include/gdnative/gdnative.h
index f7f5606428..9d7829a51f 100644
--- a/modules/gdnative/include/gdnative/gdnative.h
+++ b/modules/gdnative/include/gdnative/gdnative.h
@@ -212,10 +212,6 @@ void GDAPI godot_object_destroy(godot_object *p_o);
 
 godot_object GDAPI *godot_global_get_singleton(char *p_name); // result shouldn't be freed
 
-////// OS API
-
-void GDAPI *godot_get_stack_bottom(); //  returns stack bottom of the main thread
-
 ////// MethodBind API
 
 typedef struct {
diff --git a/modules/gdscript/SCsub b/modules/gdscript/SCsub
index 0882406761..13870170a5 100644
--- a/modules/gdscript/SCsub
+++ b/modules/gdscript/SCsub
@@ -1,7 +1,10 @@
 #!/usr/bin/env python
 
 Import('env')
+Import('env_modules')
 
-env.add_source_files(env.modules_sources, "*.cpp")
+env_gdscript = env_modules.Clone()
+
+env_gdscript.add_source_files(env.modules_sources, "*.cpp")
 
 Export('env')
diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index 599f204184..36ae61e388 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -1140,6 +1140,7 @@ GDScriptParser::Node *GDScriptParser::_parse_expression(Node *p_parent, bool p_s
 			bool unary = false;
 			bool ternary = false;
 			bool error = false;
+			bool right_to_left = false;
 
 			switch (expression[i].op) {
 
@@ -1194,11 +1195,13 @@ GDScriptParser::Node *GDScriptParser::_parse_expression(Node *p_parent, bool p_s
 				case OperatorNode::OP_TERNARY_IF:
 					priority = 14;
 					ternary = true;
+					right_to_left = true;
 					break;
 				case OperatorNode::OP_TERNARY_ELSE:
 					priority = 14;
 					error = true;
-					break; // Errors out when found without IF (since IF would consume it)
+					// Right-to-left should be false in this case, otherwise it would always error.
+					break;
 
 				case OperatorNode::OP_ASSIGN: priority = 15; break;
 				case OperatorNode::OP_ASSIGN_ADD: priority = 15; break;
@@ -1218,13 +1221,13 @@ GDScriptParser::Node *GDScriptParser::_parse_expression(Node *p_parent, bool p_s
 				}
 			}
 
-			if (priority < min_priority) {
+			if (priority < min_priority || (right_to_left && priority == min_priority)) {
+				// < is used for left to right (default)
+				// <= is used for right to left
 				if (error) {
 					_set_error("Unexpected operator");
 					return NULL;
 				}
-				// < is used for left to right (default)
-				// <= is used for right to left
 				next_op = i;
 				min_priority = priority;
 				is_unary = unary;
diff --git a/modules/gdscript/register_types.cpp b/modules/gdscript/register_types.cpp
index 1e007ddb0f..e707032ed8 100644
--- a/modules/gdscript/register_types.cpp
+++ b/modules/gdscript/register_types.cpp
@@ -30,6 +30,7 @@
 #include "register_types.h"
 
 #include "gdscript.h"
+#include "gdscript_tokenizer.h"
 #include "io/file_access_encrypted.h"
 #include "io/resource_loader.h"
 #include "os/file_access.h"
@@ -38,6 +39,45 @@ GDScriptLanguage *script_language_gd = NULL;
 ResourceFormatLoaderGDScript *resource_loader_gd = NULL;
 ResourceFormatSaverGDScript *resource_saver_gd = NULL;
 
+#ifdef TOOLS_ENABLED
+
+#include "editor/editor_export.h"
+#include "editor/editor_node.h"
+#include "editor/editor_settings.h"
+
+class EditorExportGDScript : public EditorExportPlugin {
+	// Export plugin: for every exported ".gd" file, adds a ".gdc" file built by GDScriptTokenizerBuffer.
+	GDCLASS(EditorExportGDScript, EditorExportPlugin);
+
+public:
+	virtual void _export_file(const String &p_path, const String &p_type, const Set<String> &p_features) {
+		// Only paths ending in ".gd" are handled; p_type and p_features are not consulted.
+		if (!p_path.ends_with(".gd"))
+			return;
+		// Read the whole script; if the returned array is empty, nothing is added.
+		Vector<uint8_t> file = FileAccess::get_file_as_array(p_path);
+		if (file.empty())
+			return;
+		String txt;
+		txt.parse_utf8((const char *)file.ptr(), file.size());
+		file = GDScriptTokenizerBuffer::parse_code_string(txt);
+		// An empty buffer from parse_code_string is treated the same way: nothing is added.
+		if (file.empty())
+			return;
+		// NOTE(review): the `true` argument is presumably the "remap" flag of add_file — confirm in EditorExportPlugin.
+		add_file(p_path.get_basename() + ".gdc", file, true);
+	}
+};
+
+static void _editor_init() {
+	// Editor init callback: create the GDScript export plugin and hand it to EditorExport.
+	Ref<EditorExportGDScript> export_plugin;
+	export_plugin.instance();
+	EditorExport::get_singleton()->add_export_plugin(export_plugin);
+}
+
+#endif
+
 void register_gdscript_types() {
 
 	ClassDB::register_class<GDScript>();
@@ -49,6 +89,10 @@ void register_gdscript_types() {
 	ResourceLoader::add_resource_format_loader(resource_loader_gd);
 	resource_saver_gd = memnew(ResourceFormatSaverGDScript);
 	ResourceSaver::add_resource_format_saver(resource_saver_gd);
+
+#ifdef TOOLS_ENABLED
+	EditorNode::add_init_callback(_editor_init);
+#endif
 }
 
 void unregister_gdscript_types() {
diff --git a/modules/gridmap/SCsub b/modules/gridmap/SCsub
index 0882406761..2ffe15cd33 100644
--- a/modules/gridmap/SCsub
+++ b/modules/gridmap/SCsub
@@ -1,7 +1,10 @@
 #!/usr/bin/env python
 
 Import('env')
+Import('env_modules')
 
-env.add_source_files(env.modules_sources, "*.cpp")
+env_gridmap = env_modules.Clone()
+
+env_gridmap.add_source_files(env.modules_sources, "*.cpp")
 
 Export('env')
diff --git a/modules/gridmap/grid_map.cpp b/modules/gridmap/grid_map.cpp
index b3a1947647..bebf8bcf8f 100644
--- a/modules/gridmap/grid_map.cpp
+++ b/modules/gridmap/grid_map.cpp
@@ -469,7 +469,7 @@ bool GridMap::_octant_update(const OctantKey &p_key) {
 			nm.xform = xform;
 
 			if (navigation) {
-				nm.id = navigation->navmesh_create(navmesh, xform, this);
+				nm.id = navigation->navmesh_add(navmesh, xform, this);
 			} else {
 				nm.id = -1;
 			}
@@ -556,7 +556,7 @@ void GridMap::_octant_enter_world(const OctantKey &p_key) {
 			if (cell_map.has(F->key()) && F->get().id < 0) {
 				Ref<NavigationMesh> nm = theme->get_item_navmesh(cell_map[F->key()].item);
 				if (nm.is_valid()) {
-					F->get().id = navigation->navmesh_create(nm, F->get().xform, this);
+					F->get().id = navigation->navmesh_add(nm, F->get().xform, this);
 				}
 			}
 		}
diff --git a/modules/gridmap/grid_map.h b/modules/gridmap/grid_map.h
index 5bfdf1dac3..ab66bf123e 100644
--- a/modules/gridmap/grid_map.h
+++ b/modules/gridmap/grid_map.h
@@ -186,6 +186,11 @@ class GridMap : public Spatial {
 
 	Vector3 _get_offset() const;
 
+	struct BakedMesh { // pairs a mesh reference with a transform
+		Ref<Mesh> mesh; // reference to the mesh resource
+		Transform transform; // NOTE(review): presumably the placement of `mesh` — confirm at use sites
+	};
+
 protected:
 	bool _set(const StringName &p_name, const Variant &p_value);
 	bool _get(const StringName &p_name, Variant &r_ret) const;
diff --git a/modules/mono/SCsub b/modules/mono/SCsub
index 18a20ecac4..320bbe7090 100644
--- a/modules/mono/SCsub
+++ b/modules/mono/SCsub
@@ -1,6 +1,9 @@
 #!/usr/bin/env python
 
 Import('env')
+Import('env_modules')
+
+env_mono = env_modules.Clone()
 
 from compat import byte_to_str
 
@@ -43,12 +46,12 @@ def make_cs_files_header(src, dst):
         header.write('#endif // _CS_FILES_DATA_H')
 
 
-env.add_source_files(env.modules_sources, '*.cpp')
-env.add_source_files(env.modules_sources, 'mono_gd/*.cpp')
-env.add_source_files(env.modules_sources, 'utils/*.cpp')
+env_mono.add_source_files(env.modules_sources, '*.cpp')
+env_mono.add_source_files(env.modules_sources, 'mono_gd/*.cpp')
+env_mono.add_source_files(env.modules_sources, 'utils/*.cpp')
 
 if env['tools']:
-    env.add_source_files(env.modules_sources, 'editor/*.cpp')
+    env_mono.add_source_files(env.modules_sources, 'editor/*.cpp')
     make_cs_files_header('glue/cs_files', 'glue/cs_compressed.gen.h')
 
 vars = Variables()
@@ -58,12 +61,12 @@ vars.Update(env)
 
 # Glue sources
 if env['mono_glue']:
-    env.add_source_files(env.modules_sources, 'glue/*.cpp')
+    env_mono.add_source_files(env.modules_sources, 'glue/*.cpp')
 else:
-    env.Append(CPPDEFINES=['MONO_GLUE_DISABLED'])
+    env_mono.Append(CPPDEFINES=['MONO_GLUE_DISABLED'])
 
 if ARGUMENTS.get('yolo_copy', False):
-    env.Append(CPPDEFINES=['YOLO_COPY'])
+    env_mono.Append(CPPDEFINES=['YOLO_COPY'])
 
 
 # Build GodotSharpTools solution
@@ -201,8 +204,8 @@ def mono_build_solution(source, target, env):
 
 
 mono_sln_builder = Builder(action = mono_build_solution)
-env.Append(BUILDERS={'MonoBuildSolution': mono_sln_builder})
-env.MonoBuildSolution(
+env_mono.Append(BUILDERS={'MonoBuildSolution': mono_sln_builder})
+env_mono.MonoBuildSolution(
     os.path.join(Dir('#bin').abspath, 'GodotSharpTools.dll'),
     'editor/GodotSharpTools/GodotSharpTools.sln'
 )
diff --git a/modules/openssl/stream_peer_openssl.cpp b/modules/openssl/stream_peer_openssl.cpp
index 6d1d5485f3..7e8b308cf8 100644
--- a/modules/openssl/stream_peer_openssl.cpp
+++ b/modules/openssl/stream_peer_openssl.cpp
@@ -412,8 +412,12 @@ void StreamPeerOpenSSL::_print_error(int err) {
 
 	err = SSL_get_error(ssl, err);
 	switch (err) {
-		case SSL_ERROR_NONE: ERR_PRINT("NO ERROR: The TLS/SSL I/O operation completed"); break;
-		case SSL_ERROR_ZERO_RETURN: ERR_PRINT("The TLS/SSL connection has been closed.");
+		case SSL_ERROR_NONE:
+			ERR_PRINT("NO ERROR: The TLS/SSL I/O operation completed");
+			break;
+		case SSL_ERROR_ZERO_RETURN:
+			ERR_PRINT("The TLS/SSL connection has been closed.");
+			break;
 		case SSL_ERROR_WANT_READ:
 		case SSL_ERROR_WANT_WRITE:
 			ERR_PRINT("The operation did not complete.");
diff --git a/modules/thekla_unwrap/SCsub b/modules/thekla_unwrap/SCsub
index 1d4b086848..c57bf326ea 100644
--- a/modules/thekla_unwrap/SCsub
+++ b/modules/thekla_unwrap/SCsub
@@ -56,14 +56,19 @@ if env['builtin_thekla_atlas']:
     env_thekla_unwrap.Append(CPPPATH=[thirdparty_dir, thirdparty_dir + "/poshlib", thirdparty_dir + "/nvcore", thirdparty_dir + "/nvmesh"])
 
     # upstream uses c++11
-    env_thekla_unwrap.Append(CXXFLAGS="-std=gnu++11")
+    if (not env_thekla_unwrap.msvc):
+        env_thekla_unwrap.Append(CXXFLAGS="-std=c++11")
 
     if env["platform"] == 'x11':
-        env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_LINUX"])
+        env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_LINUX", "-DPOSH_COMPILER_GCC"])
     elif env["platform"] == 'osx':
-        env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_DARWIN"])
+        env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_DARWIN", "-DPOSH_COMPILER_GCC"])
     elif env["platform"] == 'windows':
-        env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_WIN32"])
+        if env.msvc:
+            env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_WIN32", "-DNV_CC_MSVC", "-DPOSH_COMPILER_MSVC" ])
+        else:
+            env_thekla_unwrap.Append(CCFLAGS=["-DNV_OS_MINGW", "-DNV_CC_GNUC", "-DPOSH_COMPILER_GCC", "-U__STRICT_ANSI__"])
+            env.Append(LIBS=["dbghelp"])
         
 # Godot source files
 env_thekla_unwrap.add_source_files(env.modules_sources, "*.cpp")
diff --git a/modules/thekla_unwrap/register_types.cpp b/modules/thekla_unwrap/register_types.cpp
index 01b834f8cb..ab3203068f 100644
--- a/modules/thekla_unwrap/register_types.cpp
+++ b/modules/thekla_unwrap/register_types.cpp
@@ -42,7 +42,7 @@ bool thekla_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 		input_mesh.face_array[i].vertex_index[0] = p_indices[i * 3 + 0];
 		input_mesh.face_array[i].vertex_index[1] = p_indices[i * 3 + 1];
 		input_mesh.face_array[i].vertex_index[2] = p_indices[i * 3 + 2];
-		printf("face %i - %i, %i, %i - mat %i\n", i, input_mesh.face_array[i].vertex_index[0], input_mesh.face_array[i].vertex_index[1], input_mesh.face_array[i].vertex_index[2], p_face_materials[i]);
+		//printf("face %i - %i, %i, %i - mat %i\n", i, input_mesh.face_array[i].vertex_index[0], input_mesh.face_array[i].vertex_index[1], input_mesh.face_array[i].vertex_index[2], p_face_materials[i]);
 		input_mesh.face_array[i].material_index = p_face_materials[i];
 	}
 	input_mesh.vertex_array = new Thekla::Atlas_Input_Vertex[p_vertex_count];
@@ -54,8 +54,8 @@ bool thekla_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 		}
 		input_mesh.vertex_array[i].uv[0] = 0;
 		input_mesh.vertex_array[i].uv[1] = 0;
-		printf("vertex %i - %f, %f, %f\n", i, input_mesh.vertex_array[i].position[0], input_mesh.vertex_array[i].position[1], input_mesh.vertex_array[i].position[2]);
-		printf("normal %i - %f, %f, %f\n", i, input_mesh.vertex_array[i].normal[0], input_mesh.vertex_array[i].normal[1], input_mesh.vertex_array[i].normal[2]);
+		//printf("vertex %i - %f, %f, %f\n", i, input_mesh.vertex_array[i].position[0], input_mesh.vertex_array[i].position[1], input_mesh.vertex_array[i].position[2]);
+		//printf("normal %i - %f, %f, %f\n", i, input_mesh.vertex_array[i].normal[0], input_mesh.vertex_array[i].normal[1], input_mesh.vertex_array[i].normal[2]);
 	}
 	input_mesh.face_count = p_index_count / 3;
 	input_mesh.vertex_count = p_vertex_count;
@@ -65,6 +65,7 @@ bool thekla_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 	Thekla::atlas_set_default_options(&options);
 	options.packer_options.witness.packing_quality = 1;
 	options.packer_options.witness.texel_area = 1.0 / p_texel_size;
+	options.packer_options.witness.conservative = true;
 
 	//generate
 	Thekla::Atlas_Error err;
diff --git a/modules/visual_script/SCsub b/modules/visual_script/SCsub
index 0882406761..96ee911ba0 100644
--- a/modules/visual_script/SCsub
+++ b/modules/visual_script/SCsub
@@ -1,7 +1,10 @@
 #!/usr/bin/env python
 
 Import('env')
+Import('env_modules')
 
-env.add_source_files(env.modules_sources, "*.cpp")
+env_vs = env_modules.Clone()
+
+env_vs.add_source_files(env.modules_sources, "*.cpp")
 
 Export('env')
diff --git a/modules/webp/SCsub b/modules/webp/SCsub
index f9295fed47..ea7af1bf9e 100644
--- a/modules/webp/SCsub
+++ b/modules/webp/SCsub
@@ -26,9 +26,6 @@ if env['builtin_libwebp']:
         "dsp/alpha_processing_neon.c",
         "dsp/alpha_processing_sse2.c",
         "dsp/alpha_processing_sse41.c",
-        "dsp/argb.c",
-        "dsp/argb_mips_dsp_r2.c",
-        "dsp/argb_sse2.c",
         "dsp/cost.c",
         "dsp/cost_mips32.c",
         "dsp/cost_mips_dsp_r2.c",
@@ -36,6 +33,9 @@ if env['builtin_libwebp']:
         "dsp/cpu.c",
         "dsp/dec.c",
         "dsp/dec_clip_tables.c",
+        "dsp/ssim.c",
+        "dsp/ssim_sse2.c",
+        "dsp/yuv_neon.c",
         "dsp/dec_mips32.c",
         "dsp/dec_mips_dsp_r2.c",
         "dsp/dec_msa.c",
@@ -84,6 +84,7 @@ if env['builtin_libwebp']:
         "dsp/yuv_sse2.c",
         "enc/alpha_enc.c",
         "enc/analysis_enc.c",
+        "enc/backward_references_cost_enc.c",
         "enc/backward_references_enc.c",
         "enc/config_enc.c",
         "enc/cost_enc.c",
@@ -122,10 +123,10 @@ if env['builtin_libwebp']:
         "utils/thread_utils.c",
         "utils/utils.c",
     ]
-    thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
+    thirdparty_sources = [thirdparty_dir + "src/" + file for file in thirdparty_sources]
 
     env_webp.add_source_files(env.modules_sources, thirdparty_sources)
-    env_webp.Append(CPPPATH=[thirdparty_dir])
+    env_webp.Append(CPPPATH=[thirdparty_dir, thirdparty_dir + "src/"])
 
 # Godot source files
 env_webp.add_source_files(env.modules_sources, "*.cpp")
diff --git a/platform/javascript/http_client_javascript.cpp b/platform/javascript/http_client_javascript.cpp
index 0b105dcb40..b170ba6f35 100644
--- a/platform/javascript/http_client_javascript.cpp
+++ b/platform/javascript/http_client_javascript.cpp
@@ -37,16 +37,31 @@ Error HTTPClient::connect_to_host(const String &p_host, int p_port, bool p_ssl,
 		WARN_PRINT("Disabling HTTPClient's host verification is not supported for the HTML5 platform, host will be verified");
 	}
 
+	port = p_port;
+	use_tls = p_ssl;
+
 	host = p_host;
-	if (host.begins_with("http://")) {
-		host.replace_first("http://", "");
-	} else if (host.begins_with("https://")) {
-		host.replace_first("https://", "");
+
+	String host_lower = host.to_lower();
+	if (host_lower.begins_with("http://")) {
+		host = host.substr(7, host.length() - 7);
+	} else if (host_lower.begins_with("https://")) {
+		use_tls = true;
+		host = host.substr(8, host.length() - 8);
+	}
+
+	ERR_FAIL_COND_V(host.length() < HOST_MIN_LEN, ERR_INVALID_PARAMETER);
+
+	if (port < 0) {
+		if (use_tls) {
+			port = PORT_HTTPS;
+		} else {
+			port = PORT_HTTP;
+		}
 	}
 
 	status = host.is_valid_ip_address() ? STATUS_CONNECTING : STATUS_RESOLVING;
-	port = p_port;
-	use_tls = p_ssl;
+
 	return OK;
 }
 
@@ -68,17 +83,7 @@ Error HTTPClient::prepare_request(Method p_method, const String &p_url, const Ve
 	ERR_FAIL_COND_V(status != STATUS_CONNECTED, ERR_INVALID_PARAMETER);
 	ERR_FAIL_COND_V(host.empty(), ERR_UNCONFIGURED);
 	ERR_FAIL_COND_V(port < 0, ERR_UNCONFIGURED);
-
-	static const char *_methods[HTTPClient::METHOD_MAX] = {
-		"GET",
-		"HEAD",
-		"POST",
-		"PUT",
-		"DELETE",
-		"OPTIONS",
-		"TRACE",
-		"CONNECT"
-	};
+	ERR_FAIL_COND_V(!p_url.begins_with("/"), ERR_INVALID_PARAMETER);
-
-	String url = (use_tls ? "https://" : "http://") + host + ":" + itos(port) + "/" + p_url;
+	// p_url is validated above to begin with "/", so it is appended without an extra separator.
+	String url = (use_tls ? "https://" : "http://") + host + ":" + itos(port) + p_url;
 	godot_xhr_reset(xhr_id);
diff --git a/platform/osx/SCsub b/platform/osx/SCsub
index cb88bc470a..029e3d808c 100644
--- a/platform/osx/SCsub
+++ b/platform/osx/SCsub
@@ -4,7 +4,12 @@ import os
 Import('env')
 
 def make_debug(target, source, env):
-    os.system('dsymutil %s -o %s.dSYM' % (target[0], target[0]))
+    # Use the system dsymutil unless building with a MacPorts clang, which has its own llvm-dsymutil.
+    dsymutil = 'dsymutil'
+    if (env["macports_clang"] != 'no'):
+        mpprefix = os.environ.get("MACPORTS_PREFIX", "/opt/local")
+        dsymutil = mpprefix + '/libexec/llvm-' + env["macports_clang"] + '/bin/llvm-dsymutil'
+    os.system('%s %s -o %s.dSYM' % (dsymutil, target[0], target[0]))
 
 files = [
     'crash_handler_osx.mm',
diff --git a/platform/osx/detect.py b/platform/osx/detect.py
index ff7cf2ad2f..e8a8319431 100644
--- a/platform/osx/detect.py
+++ b/platform/osx/detect.py
@@ -72,6 +72,19 @@ def configure(env):
         else: # 64-bit, default
             env.Append(CCFLAGS=['-arch', 'x86_64'])
             env.Append(LINKFLAGS=['-arch', 'x86_64'])
+        if (env["macports_clang"] != 'no'):
+            mpprefix = os.environ.get("MACPORTS_PREFIX", "/opt/local")
+            mpclangver = env["macports_clang"]
+            env["CC"] = mpprefix + "/libexec/llvm-" + mpclangver + "/bin/clang"
+            env["LD"] = mpprefix + "/libexec/llvm-" + mpclangver + "/bin/clang++"
+            env["CXX"] = mpprefix + "/libexec/llvm-" + mpclangver + "/bin/clang++"
+            env['AR'] = mpprefix + "/libexec/llvm-" + mpclangver + "/bin/llvm-ar"
+            env['RANLIB'] = mpprefix + "/libexec/llvm-" + mpclangver + "/bin/llvm-ranlib"
+            env['AS'] = mpprefix + "/libexec/llvm-" + mpclangver + "/bin/llvm-as"
+            env.Append(CCFLAGS=['-D__MACPORTS__']) #hack to fix libvpx MM256_BROADCASTSI128_SI256 define
+            if (env["openmp"]):
+                env.Append(CPPFLAGS=['-fopenmp'])
+                env.Append(LINKFLAGS=['-fopenmp'])
 
     else: # osxcross build
         root = os.environ.get("OSXCROSS_ROOT", 0)
diff --git a/platform/osx/os_osx.h b/platform/osx/os_osx.h
index 6543ca7dd2..ede50a05ba 100644
--- a/platform/osx/os_osx.h
+++ b/platform/osx/os_osx.h
@@ -204,7 +204,7 @@ public:
 	virtual void request_attention();
 	virtual String get_joy_guid(int p_device) const;
 
-	virtual void set_borderless_window(int p_borderless);
+	virtual void set_borderless_window(bool p_borderless);
 	virtual bool get_borderless_window();
 	virtual void set_ime_position(const Point2 &p_pos);
 	virtual void set_ime_intermediate_text_callback(ImeCallback p_callback, void *p_inp);
@@ -228,6 +228,8 @@ public:
 
 	virtual Error move_to_trash(const String &p_path);
 
+	void force_process_input();
+
 	OS_OSX();
 
 private:
diff --git a/platform/osx/os_osx.mm b/platform/osx/os_osx.mm
index 75d0bd1648..f3809e6eed 100644
--- a/platform/osx/os_osx.mm
+++ b/platform/osx/os_osx.mm
@@ -1818,7 +1818,7 @@ void OS_OSX::request_attention() {
 	[NSApp requestUserAttention:NSCriticalRequest];
 }
 
-void OS_OSX::set_borderless_window(int p_borderless) {
+void OS_OSX::set_borderless_window(bool p_borderless) {
 
 	// OrderOut prevents a lose focus bug with the window
 	[window_object orderOut:nil];
@@ -1971,6 +1971,12 @@ void OS_OSX::push_input(const Ref<InputEvent> &p_event) {
 	input->parse_input_event(ev);
 }
 
+void OS_OSX::force_process_input() {
+	// Runs one round of event processing followed by joypad processing, on demand.
+	process_events(); // get rid of pending events
+	joypad_osx->process_joypads();
+}
+
 void OS_OSX::run() {
 
 	force_quit = false;
diff --git a/platform/uwp/SCsub b/platform/uwp/SCsub
index f0d69fef33..fb0c4a92ae 100644
--- a/platform/uwp/SCsub
+++ b/platform/uwp/SCsub
@@ -4,9 +4,6 @@ Import('env')
 
 files = [
     'thread_uwp.cpp',
-    '#platform/windows/tcp_server_winsock.cpp',
-    '#platform/windows/packet_peer_udp_winsock.cpp',
-    '#platform/windows/stream_peer_winsock.cpp',
     '#platform/windows/key_mapping_win.cpp',
     '#platform/windows/windows_terminal_logger.cpp',
     'joypad_uwp.cpp',
diff --git a/platform/uwp/os_uwp.cpp b/platform/uwp/os_uwp.cpp
index 659f162724..3018ac0bef 100644
--- a/platform/uwp/os_uwp.cpp
+++ b/platform/uwp/os_uwp.cpp
@@ -34,13 +34,13 @@
 #include "drivers/windows/dir_access_windows.h"
 #include "drivers/windows/file_access_windows.h"
 #include "drivers/windows/mutex_windows.h"
+#include "drivers/windows/packet_peer_udp_winsock.h"
 #include "drivers/windows/rw_lock_windows.h"
 #include "drivers/windows/semaphore_windows.h"
+#include "drivers/windows/stream_peer_tcp_winsock.h"
+#include "drivers/windows/tcp_server_winsock.h"
 #include "io/marshalls.h"
 #include "main/main.h"
-#include "platform/windows/packet_peer_udp_winsock.h"
-#include "platform/windows/stream_peer_winsock.h"
-#include "platform/windows/tcp_server_winsock.h"
 #include "platform/windows/windows_terminal_logger.h"
 #include "project_settings.h"
 #include "servers/audio_server.h"
@@ -163,7 +163,7 @@ void OSUWP::initialize_core() {
 	DirAccess::make_default<DirAccessWindows>(DirAccess::ACCESS_FILESYSTEM);
 
 	TCPServerWinsock::make_default();
-	StreamPeerWinsock::make_default();
+	StreamPeerTCPWinsock::make_default();
 	PacketPeerUDPWinsock::make_default();
 
 	// We need to know how often the clock is updated
diff --git a/platform/windows/SCsub b/platform/windows/SCsub
index 5a253d5db5..5030f4b3e0 100644
--- a/platform/windows/SCsub
+++ b/platform/windows/SCsub
@@ -4,9 +4,14 @@ import os
 Import('env')
 
 def make_debug_mingw(target, source, env):
-    os.system('objcopy --only-keep-debug %s %s.debug' % (target[0], target[0]))
-    os.system('strip --strip-debug --strip-unneeded %s' % (target[0]))
-    os.system('objcopy --add-gnu-debuglink=%s.debug %s' % (target[0], target[0]))
+    # objcopy/strip are invoked with the MinGW prefix configured for the target bitness.
+    if env["bits"] == "32":
+        toolchain = env["mingw_prefix_32"]
+    else:
+        toolchain = env["mingw_prefix_64"]
+    os.system(toolchain + 'objcopy --only-keep-debug %s %s.debug' % (target[0], target[0]))
+    os.system(toolchain + 'strip --strip-debug --strip-unneeded %s' % (target[0]))
+    os.system(toolchain + 'objcopy --add-gnu-debuglink=%s.debug %s' % (target[0], target[0]))
 
 common_win = [
     "context_gl_win.cpp",
@@ -14,9 +19,6 @@ common_win = [
     "os_windows.cpp",
     "ctxgl_procaddr.cpp",
     "key_mapping_win.cpp",
-    "tcp_server_winsock.cpp",
-    "packet_peer_udp_winsock.cpp",
-    "stream_peer_winsock.cpp",
     "joypad.cpp",
     "power_windows.cpp",
     "windows_terminal_logger.cpp"
diff --git a/platform/windows/context_gl_win.cpp b/platform/windows/context_gl_win.cpp
index 81aa18dd23..ccb0a41d13 100644
--- a/platform/windows/context_gl_win.cpp
+++ b/platform/windows/context_gl_win.cpp
@@ -181,8 +181,6 @@ Error ContextGL_Win::initialize() {
 			MessageBox(NULL, "Can't Activate The GL 3.3 Rendering Context.", "ERROR", MB_OK | MB_ICONEXCLAMATION);
 			return ERR_CANT_CREATE; // Return FALSE
 		}
-
-		printf("Activated GL 3.3 context");
 	}
 
 	wglSwapIntervalEXT = (PFNWGLSWAPINTERVALEXTPROC)wglGetProcAddress("wglSwapIntervalEXT");
diff --git a/platform/windows/detect.py b/platform/windows/detect.py
index d85e1b061c..01eb50e69c 100644
--- a/platform/windows/detect.py
+++ b/platform/windows/detect.py
@@ -188,6 +188,9 @@ def configure(env):
         else:
             VC_PATH = ""
 
+        if (env["openmp"]):
+            env.Append(CPPFLAGS=['/openmp'])
+
         env.Append(CCFLAGS=["/I" + p for p in os.getenv("INCLUDE").split(";")])
         env.Append(LIBPATH=[p for p in os.getenv("LIB").split(";")])
 
@@ -264,6 +267,10 @@ def configure(env):
             env.Append(CCFLAGS=['-flto'])
             env.Append(LINKFLAGS=['-flto=' + str(env.GetOption("num_jobs"))])
 
+        if (env["openmp"]):
+            env.Append(CPPFLAGS=['-fopenmp'])
+            env.Append(LINKFLAGS=['-fopenmp'])
+
         ## Compile flags
 
         env.Append(CCFLAGS=['-DWINDOWS_ENABLED', '-mwindows'])
diff --git a/platform/windows/os_windows.cpp b/platform/windows/os_windows.cpp
index 41730d33af..6cab683e83 100644
--- a/platform/windows/os_windows.cpp
+++ b/platform/windows/os_windows.cpp
@@ -34,19 +34,19 @@
 #include "drivers/windows/dir_access_windows.h"
 #include "drivers/windows/file_access_windows.h"
 #include "drivers/windows/mutex_windows.h"
+#include "drivers/windows/packet_peer_udp_winsock.h"
 #include "drivers/windows/rw_lock_windows.h"
 #include "drivers/windows/semaphore_windows.h"
+#include "drivers/windows/stream_peer_tcp_winsock.h"
+#include "drivers/windows/tcp_server_winsock.h"
 #include "drivers/windows/thread_windows.h"
 #include "io/marshalls.h"
 #include "joypad.h"
 #include "lang_table.h"
 #include "main/main.h"
-#include "packet_peer_udp_winsock.h"
 #include "servers/audio_server.h"
 #include "servers/visual/visual_server_raster.h"
 #include "servers/visual/visual_server_wrap_mt.h"
-#include "stream_peer_winsock.h"
-#include "tcp_server_winsock.h"
 #include "version_generated.gen.h"
 #include "windows_terminal_logger.h"
 
@@ -196,7 +196,7 @@ void OS_Windows::initialize_core() {
 	DirAccess::make_default<DirAccessWindows>(DirAccess::ACCESS_FILESYSTEM);
 
 	TCPServerWinsock::make_default();
-	StreamPeerWinsock::make_default();
+	StreamPeerTCPWinsock::make_default();
 	PacketPeerUDPWinsock::make_default();
 
 	// We need to know how often the clock is updated
@@ -1253,7 +1253,7 @@ void OS_Windows::finalize_core() {
 	memdelete(process_map);
 
 	TCPServerWinsock::cleanup();
-	StreamPeerWinsock::cleanup();
+	StreamPeerTCPWinsock::cleanup();
 }
 
 void OS_Windows::alert(const String &p_alert, const String &p_title) {
@@ -1598,7 +1598,7 @@ bool OS_Windows::is_window_maximized() const {
 	return maximized;
 }
 
-void OS_Windows::set_borderless_window(int p_borderless) {
+void OS_Windows::set_borderless_window(bool p_borderless) {
 	if (video_mode.borderless_window == p_borderless)
 		return;
 
@@ -2156,6 +2156,10 @@ void OS_Windows::swap_buffers() {
 	gl_context->swap_buffers();
 }
 
+void OS_Windows::force_process_input() {
+	process_events(); // get rid of pending events by running the event processing now
+}
+
 void OS_Windows::run() {
 
 	if (!main_loop)
diff --git a/platform/windows/os_windows.h b/platform/windows/os_windows.h
index af1ccd4446..f2226a53a9 100644
--- a/platform/windows/os_windows.h
+++ b/platform/windows/os_windows.h
@@ -210,7 +210,7 @@ public:
 	virtual bool is_window_maximized() const;
 	virtual void request_attention();
 
-	virtual void set_borderless_window(int p_borderless);
+	virtual void set_borderless_window(bool p_borderless);
 	virtual bool get_borderless_window();
 
 	virtual Error open_dynamic_library(const String p_path, void *&p_library_handle, bool p_also_set_library_path = false);
@@ -287,6 +287,8 @@ public:
 	void disable_crash_handler();
 	bool is_disable_crash_handler() const;
 
+	void force_process_input();
+
 	virtual Error move_to_trash(const String &p_path);
 
 	OS_Windows(HINSTANCE _hInstance);
diff --git a/platform/x11/detect.py b/platform/x11/detect.py
index d7dbe71da4..09bf57c5f1 100644
--- a/platform/x11/detect.py
+++ b/platform/x11/detect.py
@@ -158,6 +158,7 @@ def configure(env):
     if not env['builtin_libwebp']:
         env.ParseConfig('pkg-config libwebp --cflags --libs')
 
+
     # freetype depends on libpng and zlib, so bundling one of them while keeping others
     # as shared libraries leads to weird issues
     if env['builtin_freetype'] or env['builtin_libpng'] or env['builtin_zlib']:
@@ -263,5 +264,10 @@ def configure(env):
         env.Append(CPPFLAGS=['-m64'])
         env.Append(LINKFLAGS=['-m64', '-L/usr/lib/i686-linux-gnu'])
 
+
+    if env["openmp"]:
+        env.Append(CPPFLAGS=['-fopenmp'])
+        env.Append(LINKFLAGS=['-fopenmp'])
+
     if env['use_static_cpp']:
         env.Append(LINKFLAGS=['-static-libstdc++'])
diff --git a/platform/x11/os_x11.cpp b/platform/x11/os_x11.cpp
index b59fab7088..0c0bc1a8a3 100644
--- a/platform/x11/os_x11.cpp
+++ b/platform/x11/os_x11.cpp
@@ -1098,7 +1098,7 @@ bool OS_X11::is_window_maximized() const {
 	return false;
 }
 
-void OS_X11::set_borderless_window(int p_borderless) {
+void OS_X11::set_borderless_window(bool p_borderless) {
 
 	if (current_videomode.borderless_window == p_borderless)
 		return;
@@ -2264,6 +2264,13 @@ void OS_X11::set_icon(const Ref<Image> &p_icon) {
 	XFlush(x11_display);
 }
 
+void OS_X11::force_process_input() {
+	process_xevents(); // get rid of pending events
+#ifdef JOYDEV_ENABLED
+	joypad->process_joypads(); // only compiled in when JOYDEV_ENABLED is defined
+#endif
+}
+
 void OS_X11::run() {
 
 	force_quit = false;
diff --git a/platform/x11/os_x11.h b/platform/x11/os_x11.h
index 244c69ee2b..c8cea1e30c 100644
--- a/platform/x11/os_x11.h
+++ b/platform/x11/os_x11.h
@@ -258,7 +258,7 @@ public:
 	virtual bool is_window_maximized() const;
 	virtual void request_attention();
 
-	virtual void set_borderless_window(int p_borderless);
+	virtual void set_borderless_window(bool p_borderless);
 	virtual bool get_borderless_window();
 	virtual void set_ime_position(const Point2 &p_pos);
 
@@ -279,6 +279,7 @@ public:
 
 	virtual bool _check_internal_feature_support(const String &p_feature);
 
+	virtual void force_process_input();
 	void run();
 
 	void disable_crash_handler();
diff --git a/scene/2d/animated_sprite.cpp b/scene/2d/animated_sprite.cpp
index f8f94926b7..de28fef929 100644
--- a/scene/2d/animated_sprite.cpp
+++ b/scene/2d/animated_sprite.cpp
@@ -355,38 +355,21 @@ void AnimatedSprite::_notification(int p_what) {
 
 		case NOTIFICATION_DRAW: {
 
-			if (frames.is_null()) {
-				print_line("no draw no faemos");
+			if (frames.is_null())
 				return;
-			}
-
-			if (frame < 0) {
-				print_line("no draw frame <0");
+			if (frame < 0)
 				return;
-			}
-
-			if (!frames->has_animation(animation)) {
-				print_line("no draw no anim: " + String(animation));
+			if (!frames->has_animation(animation))
 				return;
-			}
 
 			Ref<Texture> texture = frames->get_frame(animation, frame);
-			if (texture.is_null()) {
-				print_line("no draw texture is null");
+			if (texture.is_null())
 				return;
-			}
 
 			Ref<Texture> normal = frames->get_normal_frame(animation, frame);
 
-			//print_line("DECIDED TO DRAW");
-
 			RID ci = get_canvas_item();
 
-			/*
-			texture->draw(ci,Point2());
-			break;
-			*/
-
 			Size2i s;
 			s = texture->get_size();
 			Point2 ofs = offset;
@@ -403,9 +386,7 @@ void AnimatedSprite::_notification(int p_what) {
 			if (vflip)
 				dst_rect.size.y = -dst_rect.size.y;
 
-			//texture->draw_rect(ci,dst_rect,false,modulate);
 			texture->draw_rect_region(ci, dst_rect, Rect2(Vector2(), texture->get_size()), Color(1, 1, 1), false, normal);
-			//VisualServer::get_singleton()->canvas_item_add_texture_rect_region(ci,dst_rect,texture->get_rid(),src_rect,modulate);
 
 		} break;
 	}
diff --git a/scene/2d/navigation2d.cpp b/scene/2d/navigation2d.cpp
index 9eff107827..40013814f8 100644
--- a/scene/2d/navigation2d.cpp
+++ b/scene/2d/navigation2d.cpp
@@ -205,7 +205,7 @@ void Navigation2D::_navpoly_unlink(int p_id) {
 	nm.linked = false;
 }
 
-int Navigation2D::navpoly_create(const Ref<NavigationPolygon> &p_mesh, const Transform2D &p_xform, Object *p_owner) {
+int Navigation2D::navpoly_add(const Ref<NavigationPolygon> &p_mesh, const Transform2D &p_xform, Object *p_owner) {
 
 	int id = last_id++;
 	NavMesh nm;
@@ -708,7 +708,7 @@ Object *Navigation2D::get_closest_point_owner(const Vector2 &p_point) {
 
 void Navigation2D::_bind_methods() {
 
-	ClassDB::bind_method(D_METHOD("navpoly_create", "mesh", "xform", "owner"), &Navigation2D::navpoly_create, DEFVAL(Variant()));
+	ClassDB::bind_method(D_METHOD("navpoly_add", "mesh", "xform", "owner"), &Navigation2D::navpoly_add, DEFVAL(Variant()));
 	ClassDB::bind_method(D_METHOD("navpoly_set_transform", "id", "xform"), &Navigation2D::navpoly_set_transform);
 	ClassDB::bind_method(D_METHOD("navpoly_remove", "id"), &Navigation2D::navpoly_remove);
 
diff --git a/scene/2d/navigation2d.h b/scene/2d/navigation2d.h
index bb97e1a9a9..02dbcb0f96 100644
--- a/scene/2d/navigation2d.h
+++ b/scene/2d/navigation2d.h
@@ -159,7 +159,7 @@ protected:
 
 public:
 	//API should be as dynamic as possible
-	int navpoly_create(const Ref<NavigationPolygon> &p_mesh, const Transform2D &p_xform, Object *p_owner = NULL);
+	int navpoly_add(const Ref<NavigationPolygon> &p_mesh, const Transform2D &p_xform, Object *p_owner = NULL);
 	void navpoly_set_transform(int p_id, const Transform2D &p_xform);
 	void navpoly_remove(int p_id);
 
diff --git a/scene/2d/navigation_polygon.cpp b/scene/2d/navigation_polygon.cpp
index c53241e985..5a6a5128e6 100644
--- a/scene/2d/navigation_polygon.cpp
+++ b/scene/2d/navigation_polygon.cpp
@@ -293,7 +293,7 @@ void NavigationPolygonInstance::set_enabled(bool p_enabled) {
 
 			if (navpoly.is_valid()) {
 
-				nav_id = navigation->navpoly_create(navpoly, get_relative_transform_to_parent(navigation), this);
+				nav_id = navigation->navpoly_add(navpoly, get_relative_transform_to_parent(navigation), this);
 			}
 		}
 	}
@@ -324,7 +324,7 @@ void NavigationPolygonInstance::_notification(int p_what) {
 
 					if (enabled && navpoly.is_valid()) {
 
-						nav_id = navigation->navpoly_create(navpoly, get_relative_transform_to_parent(navigation), this);
+						nav_id = navigation->navpoly_add(navpoly, get_relative_transform_to_parent(navigation), this);
 					}
 					break;
 				}
@@ -419,7 +419,7 @@ void NavigationPolygonInstance::set_navigation_polygon(const Ref<NavigationPolyg
 	}
 
 	if (navigation && navpoly.is_valid() && enabled) {
-		nav_id = navigation->navpoly_create(navpoly, get_relative_transform_to_parent(navigation), this);
+		nav_id = navigation->navpoly_add(navpoly, get_relative_transform_to_parent(navigation), this);
 	}
 	//update_gizmo();
 	_change_notify("navpoly");
diff --git a/scene/2d/tile_map.cpp b/scene/2d/tile_map.cpp
index 1e34372d1e..bcdad177c5 100644
--- a/scene/2d/tile_map.cpp
+++ b/scene/2d/tile_map.cpp
@@ -486,7 +486,7 @@ void TileMap::_update_dirty_quadrants() {
 					xform.set_origin(offset.floor() + q.pos);
 					_fix_cell_transform(xform, c, npoly_ofs + center_ofs, s);
 
-					int pid = navigation->navpoly_create(navpoly, nav_rel * xform);
+					int pid = navigation->navpoly_add(navpoly, nav_rel * xform);
 
 					Quadrant::NavPoly np;
 					np.id = pid;
diff --git a/scene/3d/baked_lightmap.cpp b/scene/3d/baked_lightmap.cpp
new file mode 100644
index 0000000000..3d9bb73181
--- /dev/null
+++ b/scene/3d/baked_lightmap.cpp
@@ -0,0 +1,718 @@
+#include "baked_lightmap.h"
+#include "io/resource_saver.h"
+#include "os/dir_access.h"
+#include "os/os.h"
+#include "voxel_light_baker.h"
+
+void BakedLightmapData::set_bounds(const AABB &p_bounds) {
+	// Keep a local copy for get_bounds() and forward the value to the visual server.
+	bounds = p_bounds;
+	VS::get_singleton()->lightmap_capture_set_bounds(baked_light, p_bounds);
+}
+
+AABB BakedLightmapData::get_bounds() const {
+	// Returns the locally stored bounds member (no visual server query).
+	return bounds;
+}
+
+void BakedLightmapData::set_octree(const PoolVector<uint8_t> &p_octree) {
+	// No local copy is kept: the octree bytes are handed straight to the visual server.
+	VS::get_singleton()->lightmap_capture_set_octree(baked_light, p_octree);
+}
+
+PoolVector<uint8_t> BakedLightmapData::get_octree() const {
+	// Fetched from the visual server rather than from a member of this class.
+	return VS::get_singleton()->lightmap_capture_get_octree(baked_light);
+}
+
+void BakedLightmapData::set_cell_space_transform(const Transform &p_xform) {
+	// Keep a local copy for the getter and forward the transform to the visual server.
+	cell_space_xform = p_xform;
+	VS::get_singleton()->lightmap_capture_set_octree_cell_transform(baked_light, p_xform);
+}
+
+Transform BakedLightmapData::get_cell_space_transform() const {
+	return cell_space_xform; // local copy stored by set_cell_space_transform()
+}
+
+void BakedLightmapData::set_cell_subdiv(int p_cell_subdiv) {
+	cell_subdiv = p_cell_subdiv; // local copy for get_cell_subdiv()
+	VS::get_singleton()->lightmap_capture_set_octree_cell_subdiv(baked_light, p_cell_subdiv);
+}
+
+int BakedLightmapData::get_cell_subdiv() const {
+	return cell_subdiv; // local copy stored by set_cell_subdiv(); initialised to 1 in the constructor
+}
+
+void BakedLightmapData::set_energy(float p_energy) {
+	// Store the energy locally, then push the stored value to the visual server.
+	energy = p_energy;
+	VS::get_singleton()->lightmap_capture_set_energy(baked_light, energy);
+}
+
+float BakedLightmapData::get_energy() const {
+	// Returns the locally stored energy (initialised to 1 in the constructor).
+	return energy;
+}
+
+void BakedLightmapData::add_user(const NodePath &p_path, const Ref<Texture> &p_lightmap) {
+	// Appends a (node path, lightmap texture) entry; a null texture is rejected with an error.
+	ERR_FAIL_COND(p_lightmap.is_null());
+	User entry;
+	entry.lightmap = p_lightmap;
+	entry.path = p_path;
+	users.push_back(entry);
+}
+
+int BakedLightmapData::get_user_count() const {
+	// Number of entries currently held in the users list (filled by add_user()).
+	return users.size();
+}
+NodePath BakedLightmapData::get_user_path(int p_user) const {
+	// An out-of-range index fails the check below and yields a default NodePath.
+	ERR_FAIL_INDEX_V(p_user, users.size(), NodePath());
+	return users[p_user].path;
+}
+Ref<Texture> BakedLightmapData::get_user_lightmap(int p_user) const {
+	// An out-of-range index fails the check below and yields a null texture reference.
+	ERR_FAIL_INDEX_V(p_user, users.size(), Ref<Texture>());
+	return users[p_user].lightmap;
+}
+
+void BakedLightmapData::clear_users() {
+	users.clear(); // drops every (path, lightmap) entry
+}
+
+void BakedLightmapData::_set_user_data(const Array &p_data) {
+	// p_data is a flat [path, lightmap, path, lightmap, ...] list, hence the even-size check.
+	ERR_FAIL_COND(p_data.size() & 1);
+	const int total = p_data.size();
+	for (int pair_start = 0; pair_start + 1 < total; pair_start += 2) {
+		add_user(p_data[pair_start], p_data[pair_start + 1]);
+	}
+}
+
+Array BakedLightmapData::_get_user_data() const {
+	// Flattens the users list into [path, lightmap, path, lightmap, ...] order.
+	Array flat;
+	for (int u = 0; u != users.size(); ++u) {
+		flat.push_back(users[u].path);
+		flat.push_back(users[u].lightmap);
+	}
+	return flat;
+}
+
+RID BakedLightmapData::get_rid() const {
+	return baked_light; // RID obtained from lightmap_capture_create() in the constructor
+}
+void BakedLightmapData::_bind_methods() {
+	// Underscore-prefixed accessors back the "user_data" property registered at the end.
+	ClassDB::bind_method(D_METHOD("_set_user_data", "data"), &BakedLightmapData::_set_user_data);
+	ClassDB::bind_method(D_METHOD("_get_user_data"), &BakedLightmapData::_get_user_data);
+
+	ClassDB::bind_method(D_METHOD("set_bounds", "bounds"), &BakedLightmapData::set_bounds);
+	ClassDB::bind_method(D_METHOD("get_bounds"), &BakedLightmapData::get_bounds);
+
+	ClassDB::bind_method(D_METHOD("set_cell_space_transform", "xform"), &BakedLightmapData::set_cell_space_transform);
+	ClassDB::bind_method(D_METHOD("get_cell_space_transform"), &BakedLightmapData::get_cell_space_transform);
+
+	ClassDB::bind_method(D_METHOD("set_cell_subdiv", "cell_subdiv"), &BakedLightmapData::set_cell_subdiv);
+	ClassDB::bind_method(D_METHOD("get_cell_subdiv"), &BakedLightmapData::get_cell_subdiv);
+
+	ClassDB::bind_method(D_METHOD("set_octree", "octree"), &BakedLightmapData::set_octree);
+	ClassDB::bind_method(D_METHOD("get_octree"), &BakedLightmapData::get_octree);
+
+	ClassDB::bind_method(D_METHOD("set_energy", "energy"), &BakedLightmapData::set_energy);
+	ClassDB::bind_method(D_METHOD("get_energy"), &BakedLightmapData::get_energy);
+	// User entries: (node path, lightmap texture) pairs.
+	ClassDB::bind_method(D_METHOD("add_user", "path", "lightmap"), &BakedLightmapData::add_user);
+	ClassDB::bind_method(D_METHOD("get_user_count"), &BakedLightmapData::get_user_count);
+	ClassDB::bind_method(D_METHOD("get_user_path", "user_idx"), &BakedLightmapData::get_user_path);
+	ClassDB::bind_method(D_METHOD("get_user_lightmap", "user_idx"), &BakedLightmapData::get_user_lightmap);
+	ClassDB::bind_method(D_METHOD("clear_users"), &BakedLightmapData::clear_users);
+	// Properties: every one except "energy" is registered with PROPERTY_USAGE_NOEDITOR.
+	ADD_PROPERTY(PropertyInfo(Variant::AABB, "bounds", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_bounds", "get_bounds");
+	ADD_PROPERTY(PropertyInfo(Variant::POOL_BYTE_ARRAY, "octree", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_octree", "get_octree");
+	ADD_PROPERTY(PropertyInfo(Variant::TRANSFORM, "cell_space_transform", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_cell_space_transform", "get_cell_space_transform");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "cell_subdiv", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_cell_subdiv", "get_cell_subdiv");
+	ADD_PROPERTY(PropertyInfo(Variant::REAL, "energy", PROPERTY_HINT_RANGE, "0,16,0.01"), "set_energy", "get_energy");
+	ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "user_data", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "_set_user_data", "_get_user_data");
+}
+
+BakedLightmapData::BakedLightmapData() {
+	// Create the visual-server lightmap capture and set the local energy/subdiv defaults to 1.
+	baked_light = VS::get_singleton()->lightmap_capture_create();
+	energy = 1;
+	cell_subdiv = 1;
+}
+
+BakedLightmapData::~BakedLightmapData() {
+	// Release the visual-server RID that the constructor created.
+	VS::get_singleton()->free(baked_light);
+}
+
+///////////////////////////
+
+BakedLightmap::BakeBeginFunc BakedLightmap::bake_begin_function = NULL; // the three static bake hooks start unset
+BakedLightmap::BakeStepFunc BakedLightmap::bake_step_function = NULL;
+BakedLightmap::BakeEndFunc BakedLightmap::bake_end_function = NULL;
+
+void BakedLightmap::set_bake_subdiv(Subdiv p_subdiv) {
+	bake_subdiv = p_subdiv; // only stored here; bake() reads it to index its subdiv_value table
+}
+
+BakedLightmap::Subdiv BakedLightmap::get_bake_subdiv() const {
+	return bake_subdiv; // value stored by set_bake_subdiv()
+}
+
+void BakedLightmap::set_capture_subdiv(Subdiv p_subdiv) {
+	capture_subdiv = p_subdiv; // only stored here; bake() uses it for the capture octree subdivision
+}
+
+BakedLightmap::Subdiv BakedLightmap::get_capture_subdiv() const {
+	return capture_subdiv; // value stored by set_capture_subdiv()
+}
+
+void BakedLightmap::set_extents(const Vector3 &p_extents) {
+	extents = p_extents; // half-size: the bake volume is built as AABB(-extents, extents * 2)
+	update_gizmo();
+}
+
+Vector3 BakedLightmap::get_extents() const {
+	return extents; // value stored by set_extents()
+}
+
+void BakedLightmap::_find_meshes_and_lights(Node *p_at_node, List<PlotMesh> &plot_meshes, List<PlotLight> &plot_lights) {
+	// Recursively collects bakeable meshes and lights at/under p_at_node, with transforms relative to this node.
+	ERR_FAIL_COND(!p_at_node); // guards the child loop below; bake() may fall back to get_parent()
+	MeshInstance *mi = Object::cast_to<MeshInstance>(p_at_node);
+	if (mi && mi->get_flag(GeometryInstance::FLAG_USE_BAKED_LIGHT) && mi->is_visible_in_tree()) {
+		Ref<Mesh> mesh = mi->get_mesh();
+		if (mesh.is_valid()) {
+			bool all_have_uv2 = true;
+			for (int i = 0; i < mesh->get_surface_count(); i++) {
+				if (!(mesh->surface_get_format(i) & Mesh::ARRAY_FORMAT_TEX_UV2)) {
+					all_have_uv2 = false;
+					break;
+				}
+			}
+			// Only meshes whose surfaces all carry UV2 and that have a non-zero lightmap size hint are plotted.
+			if (all_have_uv2 && mesh->get_lightmap_size_hint() != Size2()) {
+				//READY TO BAKE! size hint could be computed if not found, actually..
+
+				AABB aabb = mesh->get_aabb();
+
+				Transform xf = get_global_transform().affine_inverse() * mi->get_global_transform();
+				// Skip meshes whose transformed AABB does not intersect the bake volume.
+				if (AABB(-extents, extents * 2).intersects(xf.xform(aabb))) {
+					PlotMesh pm;
+					pm.local_xform = xf;
+					pm.mesh = mesh;
+					pm.path = get_path_to(mi);
+					for (int i = 0; i < mesh->get_surface_count(); i++) {
+						pm.instance_materials.push_back(mi->get_surface_material(i));
+					}
+					pm.override_material = mi->get_material_override();
+					plot_meshes.push_back(pm);
+				}
+			}
+		}
+	}
+
+	Light *light = Object::cast_to<Light>(p_at_node);
+	// Lights are collected unless their bake mode is BAKE_DISABLED.
+	if (light && light->get_bake_mode() != Light::BAKE_DISABLED) {
+		PlotLight pl;
+		Transform xf = get_global_transform().affine_inverse() * light->get_global_transform();
+
+		pl.local_xform = xf;
+		pl.light = light;
+		plot_lights.push_back(pl);
+	}
+	for (int i = 0; i < p_at_node->get_child_count(); i++) {
+
+		Node *child = p_at_node->get_child(i);
+		if (!child->get_owner())
+			continue; //maybe a helper
+
+		_find_meshes_and_lights(child, plot_meshes, plot_lights);
+	}
+}
+
+void BakedLightmap::set_hdr(bool p_enable) {
+	hdr = p_enable; // bake() writes FORMAT_RGBH half-float images when set, sRGB FORMAT_RGB8 otherwise
+}
+
+bool BakedLightmap::is_hdr() const {
+	return hdr; // value stored by set_hdr()
+}
+
+bool BakedLightmap::_bake_time(void *ud, float p_secs, float p_progress) {
+	// Progress callback handed to make_lightmap(); ud is a BakeTimeData*. Returns true when the step function asks to abort.
+	uint64_t time = OS::get_singleton()->get_ticks_usec();
+	BakeTimeData *btd = (BakeTimeData *)ud;
+
+	if (time - btd->last_step > 1000000) {
+		// Report only when more than 1000000 usec passed since the last report; seconds are zero-padded ("1:05s", not "1:5s").
+		int mins_left = p_secs / 60;
+		int secs_left = Math::fmod(p_secs, 60.0f);
+		int percent = p_progress * 100;
+		bool abort = bake_step_function(btd->pass + percent, btd->text + " " + itos(percent) + "% (Time Left: " + itos(mins_left) + ":" + itos(secs_left).pad_zeros(2) + "s)");
+		btd->last_step = time;
+		if (abort)
+			return true;
+	}
+
+	return false;
+}
+
+BakedLightmap::BakeError BakedLightmap::bake(Node *p_from_node, bool p_create_visual_debug) {
+
+	String save_path;
+
+	if (image_path.begins_with("res://")) {
+		save_path = image_path;
+	} else {
+		if (get_filename() != "") {
+			save_path = get_filename().get_base_dir();
+		} else if (get_owner() && get_owner()->get_filename() != "") {
+			save_path = get_owner()->get_filename().get_base_dir();
+		}
+
+		if (save_path == "") {
+			return BAKE_ERROR_NO_SAVE_PATH;
+		}
+		if (image_path != "") {
+			save_path.plus_file(image_path);
+		}
+	}
+	{
+		//check for valid save path
+		DirAccessRef d = DirAccess::open(save_path);
+		if (!d) {
+			ERR_PRINTS("Invalid Save Path: " + save_path);
+			return BAKE_ERROR_NO_SAVE_PATH;
+		}
+	}
+
+	Ref<BakedLightmapData> new_light_data;
+	new_light_data.instance();
+
+	static const int subdiv_value[SUBDIV_MAX] = { 8, 9, 10, 11, 12, 13 };
+
+	VoxelLightBaker baker;
+
+	baker.begin_bake(subdiv_value[bake_subdiv], AABB(-extents, extents * 2.0));
+
+	List<PlotMesh> mesh_list;
+	List<PlotLight> light_list;
+
+	_find_meshes_and_lights(p_from_node ? p_from_node : get_parent(), mesh_list, light_list);
+
+	if (bake_begin_function) {
+		bake_begin_function(mesh_list.size() + light_list.size() + 1 + mesh_list.size() * 100);
+	}
+
+	int step = 0;
+
+	int pmc = 0;
+
+	for (List<PlotMesh>::Element *E = mesh_list.front(); E; E = E->next()) {
+
+		if (bake_step_function) {
+			bake_step_function(step++, RTR("Plotting Meshes: ") + " (" + itos(pmc + 1) + "/" + itos(mesh_list.size()) + ")");
+		}
+
+		pmc++;
+		baker.plot_mesh(E->get().local_xform, E->get().mesh, E->get().instance_materials, E->get().override_material);
+	}
+
+	pmc = 0;
+	baker.begin_bake_light(VoxelLightBaker::BakeQuality(bake_quality), VoxelLightBaker::BakeMode(bake_mode), propagation, energy);
+
+	for (List<PlotLight>::Element *E = light_list.front(); E; E = E->next()) {
+
+		if (bake_step_function) {
+			bake_step_function(step++, RTR("Plotting Lights:") + " (" + itos(pmc + 1) + "/" + itos(light_list.size()) + ")");
+		}
+
+		pmc++;
+		PlotLight pl = E->get();
+		switch (pl.light->get_light_type()) {
+			case VS::LIGHT_DIRECTIONAL: {
+				baker.plot_light_directional(-pl.local_xform.basis.get_axis(2), pl.light->get_color(), pl.light->get_param(Light::PARAM_ENERGY), pl.light->get_param(Light::PARAM_INDIRECT_ENERGY), pl.light->get_bake_mode() == Light::BAKE_ALL);
+			} break;
+			case VS::LIGHT_OMNI: {
+				baker.plot_light_omni(pl.local_xform.origin, pl.light->get_color(), pl.light->get_param(Light::PARAM_ENERGY), pl.light->get_param(Light::PARAM_INDIRECT_ENERGY), pl.light->get_param(Light::PARAM_RANGE), pl.light->get_param(Light::PARAM_ATTENUATION), pl.light->get_bake_mode() == Light::BAKE_ALL);
+			} break;
+			case VS::LIGHT_SPOT: {
+				baker.plot_light_spot(pl.local_xform.origin, pl.local_xform.basis.get_axis(2), pl.light->get_color(), pl.light->get_param(Light::PARAM_ENERGY), pl.light->get_param(Light::PARAM_INDIRECT_ENERGY), pl.light->get_param(Light::PARAM_RANGE), pl.light->get_param(Light::PARAM_ATTENUATION), pl.light->get_param(Light::PARAM_SPOT_ANGLE), pl.light->get_param(Light::PARAM_SPOT_ATTENUATION), pl.light->get_bake_mode() == Light::BAKE_ALL);
+
+			} break;
+		}
+	}
+	/*if (bake_step_function) {
+		bake_step_function(pmc++, RTR("Finishing Plot"));
+	}*/
+
+	baker.end_bake();
+
+	Set<String> used_mesh_names;
+
+	pmc = 0;
+	for (List<PlotMesh>::Element *E = mesh_list.front(); E; E = E->next()) {
+
+		String mesh_name = E->get().mesh->get_name();
+		if (mesh_name == "" || mesh_name.find(":") != -1 || mesh_name.find("/") != -1) {
+			mesh_name = "LightMap";
+		}
+
+		if (used_mesh_names.has(mesh_name)) {
+			int idx = 2;
+			String base = mesh_name;
+			while (true) {
+				mesh_name = base + itos(idx);
+				if (!used_mesh_names.has(mesh_name))
+					break;
+				idx++;
+			}
+		}
+		used_mesh_names.insert(mesh_name);
+
+		pmc++;
+		VoxelLightBaker::LightMapData lm;
+
+		Error err;
+		if (bake_step_function) {
+			BakeTimeData btd;
+			btd.text = RTR("Lighting Meshes: ") + mesh_name + " (" + itos(pmc) + "/" + itos(mesh_list.size()) + ")";
+			btd.pass = step;
+			btd.last_step = 0;
+			err = baker.make_lightmap(E->get().local_xform, E->get().mesh, lm, _bake_time, &btd);
+			if (err != OK) {
+				bake_end_function();
+				if (err == ERR_SKIP)
+					return BAKE_ERROR_USER_ABORTED;
+				return BAKE_ERROR_CANT_CREATE_IMAGE;
+			}
+			step += 100;
+		} else {
+
+			err = baker.make_lightmap(E->get().local_xform, E->get().mesh, lm);
+		}
+
+		if (err == OK) {
+
+			Ref<Image> image;
+			image.instance();
+
+			uint32_t tex_flags = Texture::FLAGS_DEFAULT;
+			if (hdr) {
+
+				//just save a regular image
+				PoolVector<uint8_t> data;
+				int s = lm.light.size();
+				data.resize(lm.light.size() * 2);
+				{
+
+					PoolVector<uint8_t>::Write w = data.write();
+					PoolVector<float>::Read r = lm.light.read();
+					uint16_t *hfw = (uint16_t *)w.ptr();
+					for (int i = 0; i < s; i++) {
+						hfw[i] = Math::make_half_float(r[i]);
+					}
+				}
+
+				image->create(lm.width, lm.height, false, Image::FORMAT_RGBH, data);
+
+			} else {
+
+				//just save a regular image
+				PoolVector<uint8_t> data;
+				int s = lm.light.size();
+				data.resize(lm.light.size());
+				{
+
+					PoolVector<uint8_t>::Write w = data.write();
+					PoolVector<float>::Read r = lm.light.read();
+					for (int i = 0; i < s; i += 3) {
+						Color c(r[i + 0], r[i + 1], r[i + 2]);
+						c = c.to_srgb();
+						w[i + 0] = CLAMP(c.r * 255, 0, 255);
+						w[i + 1] = CLAMP(c.g * 255, 0, 255);
+						w[i + 2] = CLAMP(c.b * 255, 0, 255);
+					}
+				}
+
+				image->create(lm.width, lm.height, false, Image::FORMAT_RGB8, data);
+
+				//This texture is saved to SRGB for two reasons:
+				// 1) first is so it looks better when doing the LINEAR->SRGB conversion (more accurate)
+				// 2) So it can be used in the GLES2 backend, which does not support linear workflow
+				tex_flags |= Texture::FLAG_CONVERT_TO_LINEAR;
+			}
+
+			Ref<ImageTexture> tex;
+			String image_path = save_path.plus_file(mesh_name + ".tex");
+			bool set_path = true;
+			if (ResourceCache::has(image_path)) {
+				tex = Ref<Resource>((Resource *)ResourceCache::get(image_path));
+				set_path = false;
+			}
+
+			if (!tex.is_valid()) {
+				tex.instance();
+			}
+
+			tex->create_from_image(image, tex_flags);
+
+			err = ResourceSaver::save(image_path, tex, ResourceSaver::FLAG_CHANGE_PATH);
+			if (err != OK) {
+				if (bake_end_function) {
+					bake_end_function();
+				}
+				ERR_FAIL_COND_V(err != OK, BAKE_ERROR_CANT_CREATE_IMAGE);
+			}
+
+			if (set_path) {
+				tex->set_path(image_path);
+			}
+			new_light_data->add_user(E->get().path, tex);
+		}
+	}
+
+	int csubdiv = subdiv_value[capture_subdiv];
+	AABB bounds = AABB(-extents, extents * 2);
+	new_light_data->set_cell_subdiv(csubdiv);
+	new_light_data->set_bounds(bounds);
+	new_light_data->set_octree(baker.create_capture_octree(csubdiv));
+	{
+
+		Transform to_bounds;
+		to_bounds.basis.scale(Vector3(bounds.get_longest_axis_size(), bounds.get_longest_axis_size(), bounds.get_longest_axis_size()));
+		to_bounds.origin = bounds.position;
+
+		Transform to_grid;
+		to_grid.basis.scale(Vector3(1 << (csubdiv - 1), 1 << (csubdiv - 1), 1 << (csubdiv - 1)));
+
+		Transform to_cell_space = to_grid * to_bounds.affine_inverse();
+		new_light_data->set_cell_space_transform(to_cell_space);
+	}
+
+	if (bake_end_function) {
+		bake_end_function();
+	}
+
+	//create the data for visual server
+
+	if (p_create_visual_debug) {
+		MultiMeshInstance *mmi = memnew(MultiMeshInstance);
+		mmi->set_multimesh(baker.create_debug_multimesh(VoxelLightBaker::DEBUG_LIGHT));
+		add_child(mmi);
+#ifdef TOOLS_ENABLED
+		if (get_tree()->get_edited_scene_root() == this) {
+			mmi->set_owner(this);
+		} else {
+			mmi->set_owner(get_owner());
+		}
+#else
+		mmi->set_owner(get_owner());
+#endif
+	}
+
+	set_light_data(new_light_data);
+
+	return BAKE_ERROR_OK;
+}
+
+void BakedLightmap::_notification(int p_what) {
+	if (p_what == NOTIFICATION_READY) {
+		// Node is ready: bind the baked lightmaps to their users, if any data is set.
+		if (light_data.is_valid()) {
+			_assign_lightmaps();
+		}
+		request_ready(); // ask for NOTIFICATION_READY again, so lightmaps are re-bound if the node re-enters the tree
+	}
+
+	if (p_what == NOTIFICATION_EXIT_TREE) {
+		// Leaving the tree: unbind the lightmaps from the instances that use them.
+		if (light_data.is_valid()) {
+			_clear_lightmaps();
+		}
+	}
+}
+
+void BakedLightmap::_assign_lightmaps() {
+	// For every user recorded in light_data, tell the VisualServer to use that user's lightmap together with this node's instance.
+	ERR_FAIL_COND(!light_data.is_valid());
+
+	for (int i = 0; i < light_data->get_user_count(); i++) {
+		Node *node = get_node(light_data->get_user_path(i)); // the stored path is resolved through this node's get_node()
+		VisualInstance *vi = Object::cast_to<VisualInstance>(node);
+		ERR_CONTINUE(!vi); // cast gave no VisualInstance: skip this user
+		Ref<Texture> lightmap = light_data->get_user_lightmap(i);
+		ERR_CONTINUE(!lightmap.is_valid()); // no valid texture stored for this user: skip it
+		VS::get_singleton()->instance_set_use_lightmap(vi->get_instance(), get_instance(), lightmap->get_rid());
+	}
+}
+
+void BakedLightmap::_clear_lightmaps() { // counterpart of _assign_lightmaps(): resets the lightmap of every user recorded in light_data
+	ERR_FAIL_COND(!light_data.is_valid());
+	for (int i = 0; i < light_data->get_user_count(); i++) {
+		Node *node = get_node(light_data->get_user_path(i));
+		VisualInstance *vi = Object::cast_to<VisualInstance>(node);
+		ERR_CONTINUE(!vi); // cast gave no VisualInstance: skip this user
+		VS::get_singleton()->instance_set_use_lightmap(vi->get_instance(), RID(), RID()); // empty RIDs where _assign_lightmaps() passes this node's instance and the texture
+	}
+}
+
+void BakedLightmap::set_light_data(const Ref<BakedLightmapData> &p_data) {
+	// Replaces the lightmap data: the current data (if valid) is detached first, then p_data is attached if it is valid.
+	if (light_data.is_valid()) {
+		if (is_inside_tree()) {
+			_clear_lightmaps(); // user instances are only touched while this node is inside the tree
+		}
+		set_base(RID());
+	}
+	light_data = p_data;
+
+	if (light_data.is_valid()) {
+		set_base(light_data->get_rid()); // the data resource's RID becomes this instance's base
+		if (is_inside_tree()) {
+			_assign_lightmaps();
+		}
+	}
+}
+
+Ref<BakedLightmapData> BakedLightmap::get_light_data() const {
+	// Returns the Ref stored by set_light_data().
+	return light_data;
+}
+
+void BakedLightmap::_debug_bake() { // bound to scripts as "debug_bake" in _bind_methods()
+	bake(get_parent(), true); // bake starting from the parent node, with p_create_visual_debug enabled; the BakeError result is ignored
+}
+
+void BakedLightmap::set_propagation(float p_propagation) {
+	propagation = p_propagation; // stored only; nothing else is updated by this setter
+}
+
+float BakedLightmap::get_propagation() const {
+	// Value stored by set_propagation(); the constructor initialises it to 1.
+	return propagation;
+}
+
+void BakedLightmap::set_energy(float p_energy) {
+	energy = p_energy; // stored only; nothing else is updated by this setter
+}
+
+float BakedLightmap::get_energy() const {
+	// Value stored by set_energy(); the constructor initialises it to 1.
+	return energy;
+}
+
+void BakedLightmap::set_bake_quality(BakeQuality p_quality) {
+	bake_quality = p_quality; // stored only; nothing else is updated by this setter
+}
+
+BakedLightmap::BakeQuality BakedLightmap::get_bake_quality() const {
+	return bake_quality; // BAKE_QUALITY_MEDIUM until changed (set in the constructor)
+}
+
+void BakedLightmap::set_bake_mode(BakeMode p_mode) {
+	bake_mode = p_mode; // stored only; nothing else is updated by this setter
+}
+
+BakedLightmap::BakeMode BakedLightmap::get_bake_mode() const {
+	return bake_mode; // BAKE_MODE_CONE_TRACE until changed (set in the constructor)
+}
+
+void BakedLightmap::set_image_path(const String &p_path) {
+	image_path = p_path; // stored as given; exposed as a directory property (PROPERTY_HINT_DIR) in _bind_methods()
+}
+
+String BakedLightmap::get_image_path() const {
+	return image_path; // "." until changed (set in the constructor)
+}
+
+AABB BakedLightmap::get_aabb() const {
+	return AABB(-extents, extents * 2); // box starting at -extents with size 2 * extents, i.e. centred on the local origin
+}
+PoolVector<Face3> BakedLightmap::get_faces(uint32_t p_usage_flags) const {
+	return PoolVector<Face3>(); // always an empty list; p_usage_flags is not consulted
+}
+
+void BakedLightmap::_bind_methods() {
+	// Registers the script-visible API: accessor methods, the bake entry points, the inspector properties and the enum constants.
+	ClassDB::bind_method(D_METHOD("set_light_data", "data"), &BakedLightmap::set_light_data);
+	ClassDB::bind_method(D_METHOD("get_light_data"), &BakedLightmap::get_light_data);
+
+	ClassDB::bind_method(D_METHOD("set_bake_subdiv", "bake_subdiv"), &BakedLightmap::set_bake_subdiv);
+	ClassDB::bind_method(D_METHOD("get_bake_subdiv"), &BakedLightmap::get_bake_subdiv);
+
+	ClassDB::bind_method(D_METHOD("set_capture_subdiv", "capture_subdiv"), &BakedLightmap::set_capture_subdiv);
+	ClassDB::bind_method(D_METHOD("get_capture_subdiv"), &BakedLightmap::get_capture_subdiv);
+
+	ClassDB::bind_method(D_METHOD("set_bake_quality", "bake_quality"), &BakedLightmap::set_bake_quality);
+	ClassDB::bind_method(D_METHOD("get_bake_quality"), &BakedLightmap::get_bake_quality);
+
+	ClassDB::bind_method(D_METHOD("set_bake_mode", "bake_mode"), &BakedLightmap::set_bake_mode);
+	ClassDB::bind_method(D_METHOD("get_bake_mode"), &BakedLightmap::get_bake_mode);
+
+	ClassDB::bind_method(D_METHOD("set_extents", "extents"), &BakedLightmap::set_extents);
+	ClassDB::bind_method(D_METHOD("get_extents"), &BakedLightmap::get_extents);
+
+	ClassDB::bind_method(D_METHOD("set_propagation", "propagation"), &BakedLightmap::set_propagation);
+	ClassDB::bind_method(D_METHOD("get_propagation"), &BakedLightmap::get_propagation);
+
+	ClassDB::bind_method(D_METHOD("set_energy", "energy"), &BakedLightmap::set_energy);
+	ClassDB::bind_method(D_METHOD("get_energy"), &BakedLightmap::get_energy);
+
+	ClassDB::bind_method(D_METHOD("set_hdr", "hdr"), &BakedLightmap::set_hdr);
+	ClassDB::bind_method(D_METHOD("is_hdr"), &BakedLightmap::is_hdr);
+
+	ClassDB::bind_method(D_METHOD("set_image_path", "image_path"), &BakedLightmap::set_image_path);
+	ClassDB::bind_method(D_METHOD("get_image_path"), &BakedLightmap::get_image_path);
+
+	ClassDB::bind_method(D_METHOD("bake", "from_node", "create_visual_debug"), &BakedLightmap::bake, DEFVAL(Variant()), DEFVAL(false));
+	ClassDB::bind_method(D_METHOD("debug_bake"), &BakedLightmap::_debug_bake);
+	ClassDB::set_method_flags(get_class_static(), _scs_create("debug_bake"), METHOD_FLAGS_DEFAULT | METHOD_FLAG_EDITOR); // debug_bake additionally carries the editor method flag
+
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_subdiv", PROPERTY_HINT_ENUM, "128,256,512,1024,2048,4096"), "set_bake_subdiv", "get_bake_subdiv");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "capture_subdiv", PROPERTY_HINT_ENUM, "128,256,512"), "set_capture_subdiv", "get_capture_subdiv");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_quality", PROPERTY_HINT_ENUM, "Low,Medium,High"), "set_bake_quality", "get_bake_quality");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_mode", PROPERTY_HINT_ENUM, "ConeTrace,RayTrace"), "set_bake_mode", "get_bake_mode");
+	ADD_PROPERTY(PropertyInfo(Variant::REAL, "propagation", PROPERTY_HINT_RANGE, "0,1,0.01"), "set_propagation", "get_propagation");
+	ADD_PROPERTY(PropertyInfo(Variant::REAL, "energy", PROPERTY_HINT_RANGE, "0,32,0.01"), "set_energy", "get_energy");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "hdr"), "set_hdr", "is_hdr");
+	ADD_PROPERTY(PropertyInfo(Variant::STRING, "image_path", PROPERTY_HINT_DIR), "set_image_path", "get_image_path");
+	ADD_PROPERTY(PropertyInfo(Variant::VECTOR3, "extents"), "set_extents", "get_extents");
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "light_data", PROPERTY_HINT_RESOURCE_TYPE, "BakedLightmapData"), "set_light_data", "get_light_data"); // hint must name the resource class that set_light_data() accepts
+	// Enum constants. NOTE(review): BakeError values are not bound although bake() returns a BakeError - confirm whether that is intended.
+	BIND_ENUM_CONSTANT(SUBDIV_128);
+	BIND_ENUM_CONSTANT(SUBDIV_256);
+	BIND_ENUM_CONSTANT(SUBDIV_512);
+	BIND_ENUM_CONSTANT(SUBDIV_1024);
+	BIND_ENUM_CONSTANT(SUBDIV_2048);
+	BIND_ENUM_CONSTANT(SUBDIV_4096);
+	BIND_ENUM_CONSTANT(SUBDIV_MAX);
+
+	BIND_ENUM_CONSTANT(BAKE_QUALITY_LOW);
+	BIND_ENUM_CONSTANT(BAKE_QUALITY_MEDIUM);
+	BIND_ENUM_CONSTANT(BAKE_QUALITY_HIGH);
+	BIND_ENUM_CONSTANT(BAKE_MODE_CONE_TRACE);
+	BIND_ENUM_CONSTANT(BAKE_MODE_RAY_TRACE);
+}
+
+BakedLightmap::BakedLightmap() {
+	// Initial property values; each one has a setter bound in _bind_methods().
+	extents = Vector3(10, 10, 10); // get_aabb() therefore starts as a 20 x 20 x 20 box
+	bake_subdiv = SUBDIV_256;
+	capture_subdiv = SUBDIV_128;
+	bake_quality = BAKE_QUALITY_MEDIUM;
+	bake_mode = BAKE_MODE_CONE_TRACE;
+	energy = 1;
+	propagation = 1;
+	hdr = false; // bake() then writes 8-bit sRGB images (FORMAT_RGB8) rather than half-float ones (FORMAT_RGBH)
+	image_path = "."; // NOTE(review): how this relative path is resolved is decided in bake(), outside this block - confirm
+}
diff --git a/scene/3d/baked_lightmap.h b/scene/3d/baked_lightmap.h
new file mode 100644
index 0000000000..5595ec1e61
--- /dev/null
+++ b/scene/3d/baked_lightmap.h
@@ -0,0 +1,189 @@
+#ifndef BAKED_INDIRECT_LIGHT_H
+#define BAKED_INDIRECT_LIGHT_H
+
+#include "multimesh_instance.h"
+#include "scene/3d/light.h"
+#include "scene/3d/visual_instance.h"
+
+class BakedLightmapData : public Resource {
+	GDCLASS(BakedLightmapData, Resource);
+	// Resource filled in by BakedLightmap::bake(): cell subdivision, bounds, capture octree, cell-space transform and one lightmap texture per baked node.
+	RID baked_light; // NOTE(review): presumably the server-side object returned by get_rid() - confirm in baked_lightmap.cpp
+	AABB bounds;
+	float energy;
+	int cell_subdiv;
+	Transform cell_space_xform;
+
+	struct User {
+		// One baked node: the path it is looked up by, and the texture baked for it.
+		NodePath path;
+		Ref<Texture> lightmap;
+	};
+
+	Vector<User> users;
+
+	void _set_user_data(const Array &p_data); // NOTE(review): looks like an Array-based accessor pair for `users` - confirm in baked_lightmap.cpp
+	Array _get_user_data() const;
+
+protected:
+	static void _bind_methods();
+
+public:
+	void set_bounds(const AABB &p_bounds);
+	AABB get_bounds() const;
+
+	void set_octree(const PoolVector<uint8_t> &p_octree); // bake() passes the byte buffer from VoxelLightBaker::create_capture_octree()
+	PoolVector<uint8_t> get_octree() const;
+
+	void set_cell_space_transform(const Transform &p_xform);
+	Transform get_cell_space_transform() const;
+
+	void set_cell_subdiv(int p_cell_subdiv);
+	int get_cell_subdiv() const;
+
+	void set_energy(float p_energy);
+	float get_energy() const;
+
+	void add_user(const NodePath &p_path, const Ref<Texture> &p_lightmap); // called by bake() once per mesh whose lightmap was saved
+	int get_user_count() const;
+	NodePath get_user_path(int p_user) const; // BakedLightmap resolves this with its own get_node()
+	Ref<Texture> get_user_lightmap(int p_user) const;
+	void clear_users();
+
+	virtual RID get_rid() const; // BakedLightmap::set_light_data() passes this RID to set_base()
+	BakedLightmapData();
+	~BakedLightmapData();
+};
+
+class BakedLightmap : public VisualInstance { // bakes lightmaps via bake() and binds the textures stored in its BakedLightmapData to the nodes that use them
+	GDCLASS(BakedLightmap, VisualInstance);
+
+public:
+	enum Subdiv { // used as an index into a subdivision table (bake() reads subdiv_value[capture_subdiv])
+		SUBDIV_128,
+		SUBDIV_256,
+		SUBDIV_512,
+		SUBDIV_1024,
+		SUBDIV_2048,
+		SUBDIV_4096,
+		SUBDIV_MAX
+
+	};
+
+	enum BakeQuality {
+		BAKE_QUALITY_LOW,
+		BAKE_QUALITY_MEDIUM,
+		BAKE_QUALITY_HIGH
+	};
+
+	enum BakeMode {
+		BAKE_MODE_CONE_TRACE,
+		BAKE_MODE_RAY_TRACE,
+	};
+
+	enum BakeError { // result of bake()
+		BAKE_ERROR_OK,
+		BAKE_ERROR_NO_SAVE_PATH,
+		BAKE_ERROR_NO_MESHES,
+		BAKE_ERROR_CANT_CREATE_IMAGE, // returned when lightmap generation fails or the texture cannot be saved
+		BAKE_ERROR_USER_ABORTED // returned when lightmap generation reports ERR_SKIP
+
+	};
+
+	typedef void (*BakeBeginFunc)(int); // signatures of the progress hooks stored in the static bake_*_function pointers below
+	typedef bool (*BakeStepFunc)(int, const String &);
+	typedef void (*BakeEndFunc)();
+
+private:
+	Subdiv bake_subdiv;
+	Subdiv capture_subdiv;
+	Vector3 extents; // half-size of the box returned by get_aabb()
+	float propagation;
+	float energy;
+	BakeQuality bake_quality;
+	BakeMode bake_mode;
+	bool hdr; // selects half-float (FORMAT_RGBH) instead of 8-bit sRGB (FORMAT_RGB8) lightmap images in bake()
+	String image_path;
+
+	Ref<BakedLightmapData> light_data;
+
+	struct PlotMesh { // a mesh collected for baking; bake() reads mesh, local_xform and path
+		Ref<Material> override_material;
+		Vector<Ref<Material> > instance_materials;
+		Ref<Mesh> mesh;
+		Transform local_xform;
+		NodePath path; // stored with the baked texture through BakedLightmapData::add_user()
+	};
+
+	struct PlotLight {
+		Light *light;
+		Transform local_xform;
+	};
+
+	void _find_meshes_and_lights(Node *p_at_node, List<PlotMesh> &plot_meshes, List<PlotLight> &plot_lights);
+
+	void _debug_bake();
+
+	void _assign_lightmaps(); // binds each user's lightmap through the VisualServer
+	void _clear_lightmaps(); // resets the lightmap of each user through the VisualServer
+
+	static bool _bake_time(void *ud, float p_secs, float p_progress); // handed to VoxelLightBaker::make_lightmap() by bake(), with a BakeTimeData as ud
+
+	struct BakeTimeData {
+		String text;
+		int pass;
+		uint64_t last_step;
+	};
+
+protected:
+	static void _bind_methods();
+	void _notification(int p_what);
+
+public:
+	static BakeBeginFunc bake_begin_function;
+	static BakeStepFunc bake_step_function; // when set, bake() reports per-mesh progress
+	static BakeEndFunc bake_end_function;
+
+	void set_light_data(const Ref<BakedLightmapData> &p_data);
+	Ref<BakedLightmapData> get_light_data() const;
+
+	void set_bake_subdiv(Subdiv p_subdiv);
+	Subdiv get_bake_subdiv() const;
+
+	void set_capture_subdiv(Subdiv p_subdiv);
+	Subdiv get_capture_subdiv() const;
+
+	void set_extents(const Vector3 &p_extents);
+	Vector3 get_extents() const;
+
+	void set_propagation(float p_propagation);
+	float get_propagation() const;
+
+	void set_energy(float p_energy);
+	float get_energy() const;
+
+	void set_bake_quality(BakeQuality p_quality);
+	BakeQuality get_bake_quality() const;
+
+	void set_bake_mode(BakeMode p_mode);
+	BakeMode get_bake_mode() const;
+
+	void set_hdr(bool p_enable);
+	bool is_hdr() const;
+
+	void set_image_path(const String &p_path);
+	String get_image_path() const;
+
+	AABB get_aabb() const;
+	PoolVector<Face3> get_faces(uint32_t p_usage_flags) const;
+
+	BakeError bake(Node *p_from_node, bool p_create_visual_debug = false); // p_create_visual_debug adds a MultiMeshInstance child showing the baker's debug multimesh
+	BakedLightmap();
+};
+
+VARIANT_ENUM_CAST(BakedLightmap::Subdiv);
+VARIANT_ENUM_CAST(BakedLightmap::BakeQuality);
+VARIANT_ENUM_CAST(BakedLightmap::BakeMode);
+VARIANT_ENUM_CAST(BakedLightmap::BakeError);
+
+#endif // BAKED_INDIRECT_LIGHT_H
diff --git a/scene/3d/gi_probe.cpp b/scene/3d/gi_probe.cpp
index 1f2b43165e..9c811a74bf 100644
--- a/scene/3d/gi_probe.cpp
+++ b/scene/3d/gi_probe.cpp
@@ -30,6 +30,7 @@
 #include "gi_probe.h"
 
 #include "mesh_instance.h"
+#include "voxel_light_baker.h"
 
 void GIProbeData::set_bounds(const AABB &p_bounds) {
 
@@ -329,754 +330,7 @@ bool GIProbe::is_compressed() const {
 	return compress;
 }
 
-#include "math.h"
-
-#define FINDMINMAX(x0, x1, x2, min, max) \
-	min = max = x0;                      \
-	if (x1 < min) min = x1;              \
-	if (x1 > max) max = x1;              \
-	if (x2 < min) min = x2;              \
-	if (x2 > max) max = x2;
-
-static bool planeBoxOverlap(Vector3 normal, float d, Vector3 maxbox) {
-	int q;
-	Vector3 vmin, vmax;
-	for (q = 0; q <= 2; q++) {
-		if (normal[q] > 0.0f) {
-			vmin[q] = -maxbox[q];
-			vmax[q] = maxbox[q];
-		} else {
-			vmin[q] = maxbox[q];
-			vmax[q] = -maxbox[q];
-		}
-	}
-	if (normal.dot(vmin) + d > 0.0f) return false;
-	if (normal.dot(vmax) + d >= 0.0f) return true;
-
-	return false;
-}
-
-/*======================== X-tests ========================*/
-#define AXISTEST_X01(a, b, fa, fb)                 \
-	p0 = a * v0.y - b * v0.z;                      \
-	p2 = a * v2.y - b * v2.z;                      \
-	if (p0 < p2) {                                 \
-		min = p0;                                  \
-		max = p2;                                  \
-	} else {                                       \
-		min = p2;                                  \
-		max = p0;                                  \
-	}                                              \
-	rad = fa * boxhalfsize.y + fb * boxhalfsize.z; \
-	if (min > rad || max < -rad) return false;
-
-#define AXISTEST_X2(a, b, fa, fb)                  \
-	p0 = a * v0.y - b * v0.z;                      \
-	p1 = a * v1.y - b * v1.z;                      \
-	if (p0 < p1) {                                 \
-		min = p0;                                  \
-		max = p1;                                  \
-	} else {                                       \
-		min = p1;                                  \
-		max = p0;                                  \
-	}                                              \
-	rad = fa * boxhalfsize.y + fb * boxhalfsize.z; \
-	if (min > rad || max < -rad) return false;
-
-/*======================== Y-tests ========================*/
-#define AXISTEST_Y02(a, b, fa, fb)                 \
-	p0 = -a * v0.x + b * v0.z;                     \
-	p2 = -a * v2.x + b * v2.z;                     \
-	if (p0 < p2) {                                 \
-		min = p0;                                  \
-		max = p2;                                  \
-	} else {                                       \
-		min = p2;                                  \
-		max = p0;                                  \
-	}                                              \
-	rad = fa * boxhalfsize.x + fb * boxhalfsize.z; \
-	if (min > rad || max < -rad) return false;
-
-#define AXISTEST_Y1(a, b, fa, fb)                  \
-	p0 = -a * v0.x + b * v0.z;                     \
-	p1 = -a * v1.x + b * v1.z;                     \
-	if (p0 < p1) {                                 \
-		min = p0;                                  \
-		max = p1;                                  \
-	} else {                                       \
-		min = p1;                                  \
-		max = p0;                                  \
-	}                                              \
-	rad = fa * boxhalfsize.x + fb * boxhalfsize.z; \
-	if (min > rad || max < -rad) return false;
-
-	/*======================== Z-tests ========================*/
-
-#define AXISTEST_Z12(a, b, fa, fb)                 \
-	p1 = a * v1.x - b * v1.y;                      \
-	p2 = a * v2.x - b * v2.y;                      \
-	if (p2 < p1) {                                 \
-		min = p2;                                  \
-		max = p1;                                  \
-	} else {                                       \
-		min = p1;                                  \
-		max = p2;                                  \
-	}                                              \
-	rad = fa * boxhalfsize.x + fb * boxhalfsize.y; \
-	if (min > rad || max < -rad) return false;
-
-#define AXISTEST_Z0(a, b, fa, fb)                  \
-	p0 = a * v0.x - b * v0.y;                      \
-	p1 = a * v1.x - b * v1.y;                      \
-	if (p0 < p1) {                                 \
-		min = p0;                                  \
-		max = p1;                                  \
-	} else {                                       \
-		min = p1;                                  \
-		max = p0;                                  \
-	}                                              \
-	rad = fa * boxhalfsize.x + fb * boxhalfsize.y; \
-	if (min > rad || max < -rad) return false;
-
-static bool fast_tri_box_overlap(const Vector3 &boxcenter, const Vector3 boxhalfsize, const Vector3 *triverts) {
-
-	/*    use separating axis theorem to test overlap between triangle and box */
-	/*    need to test for overlap in these directions: */
-	/*    1) the {x,y,z}-directions (actually, since we use the AABB of the triangle */
-	/*       we do not even need to test these) */
-	/*    2) normal of the triangle */
-	/*    3) crossproduct(edge from tri, {x,y,z}-directin) */
-	/*       this gives 3x3=9 more tests */
-	Vector3 v0, v1, v2;
-	float min, max, d, p0, p1, p2, rad, fex, fey, fez;
-	Vector3 normal, e0, e1, e2;
-
-	/* This is the fastest branch on Sun */
-	/* move everything so that the boxcenter is in (0,0,0) */
-
-	v0 = triverts[0] - boxcenter;
-	v1 = triverts[1] - boxcenter;
-	v2 = triverts[2] - boxcenter;
-
-	/* compute triangle edges */
-	e0 = v1 - v0; /* tri edge 0 */
-	e1 = v2 - v1; /* tri edge 1 */
-	e2 = v0 - v2; /* tri edge 2 */
-
-	/* Bullet 3:  */
-	/*  test the 9 tests first (this was faster) */
-	fex = Math::abs(e0.x);
-	fey = Math::abs(e0.y);
-	fez = Math::abs(e0.z);
-	AXISTEST_X01(e0.z, e0.y, fez, fey);
-	AXISTEST_Y02(e0.z, e0.x, fez, fex);
-	AXISTEST_Z12(e0.y, e0.x, fey, fex);
-
-	fex = Math::abs(e1.x);
-	fey = Math::abs(e1.y);
-	fez = Math::abs(e1.z);
-	AXISTEST_X01(e1.z, e1.y, fez, fey);
-	AXISTEST_Y02(e1.z, e1.x, fez, fex);
-	AXISTEST_Z0(e1.y, e1.x, fey, fex);
-
-	fex = Math::abs(e2.x);
-	fey = Math::abs(e2.y);
-	fez = Math::abs(e2.z);
-	AXISTEST_X2(e2.z, e2.y, fez, fey);
-	AXISTEST_Y1(e2.z, e2.x, fez, fex);
-	AXISTEST_Z12(e2.y, e2.x, fey, fex);
-
-	/* Bullet 1: */
-	/*  first test overlap in the {x,y,z}-directions */
-	/*  find min, max of the triangle each direction, and test for overlap in */
-	/*  that direction -- this is equivalent to testing a minimal AABB around */
-	/*  the triangle against the AABB */
-
-	/* test in X-direction */
-	FINDMINMAX(v0.x, v1.x, v2.x, min, max);
-	if (min > boxhalfsize.x || max < -boxhalfsize.x) return false;
-
-	/* test in Y-direction */
-	FINDMINMAX(v0.y, v1.y, v2.y, min, max);
-	if (min > boxhalfsize.y || max < -boxhalfsize.y) return false;
-
-	/* test in Z-direction */
-	FINDMINMAX(v0.z, v1.z, v2.z, min, max);
-	if (min > boxhalfsize.z || max < -boxhalfsize.z) return false;
-
-	/* Bullet 2: */
-	/*  test if the box intersects the plane of the triangle */
-	/*  compute plane equation of triangle: normal*x+d=0 */
-	normal = e0.cross(e1);
-	d = -normal.dot(v0); /* plane eq: normal.x+d=0 */
-	if (!planeBoxOverlap(normal, d, boxhalfsize)) return false;
-
-	return true; /* box and triangle overlaps */
-}
-
-static _FORCE_INLINE_ Vector2 get_uv(const Vector3 &p_pos, const Vector3 *p_vtx, const Vector2 *p_uv) {
-
-	if (p_pos.distance_squared_to(p_vtx[0]) < CMP_EPSILON2)
-		return p_uv[0];
-	if (p_pos.distance_squared_to(p_vtx[1]) < CMP_EPSILON2)
-		return p_uv[1];
-	if (p_pos.distance_squared_to(p_vtx[2]) < CMP_EPSILON2)
-		return p_uv[2];
-
-	Vector3 v0 = p_vtx[1] - p_vtx[0];
-	Vector3 v1 = p_vtx[2] - p_vtx[0];
-	Vector3 v2 = p_pos - p_vtx[0];
-
-	float d00 = v0.dot(v0);
-	float d01 = v0.dot(v1);
-	float d11 = v1.dot(v1);
-	float d20 = v2.dot(v0);
-	float d21 = v2.dot(v1);
-	float denom = (d00 * d11 - d01 * d01);
-	if (denom == 0)
-		return p_uv[0];
-	float v = (d11 * d20 - d01 * d21) / denom;
-	float w = (d00 * d21 - d01 * d20) / denom;
-	float u = 1.0f - v - w;
-
-	return p_uv[0] * u + p_uv[1] * v + p_uv[2] * w;
-}
-
-void GIProbe::_plot_face(int p_idx, int p_level, int p_x, int p_y, int p_z, const Vector3 *p_vtx, const Vector2 *p_uv, const Baker::MaterialCache &p_material, const AABB &p_aabb, Baker *p_baker) {
-
-	if (p_level == p_baker->cell_subdiv - 1) {
-		//plot the face by guessing it's albedo and emission value
-
-		//find best axis to map to, for scanning values
-		int closest_axis = 0;
-		float closest_dot = 0;
-
-		Plane plane = Plane(p_vtx[0], p_vtx[1], p_vtx[2]);
-		Vector3 normal = plane.normal;
-
-		for (int i = 0; i < 3; i++) {
-
-			Vector3 axis;
-			axis[i] = 1.0;
-			float dot = ABS(normal.dot(axis));
-			if (i == 0 || dot > closest_dot) {
-				closest_axis = i;
-				closest_dot = dot;
-			}
-		}
-
-		Vector3 axis;
-		axis[closest_axis] = 1.0;
-		Vector3 t1;
-		t1[(closest_axis + 1) % 3] = 1.0;
-		Vector3 t2;
-		t2[(closest_axis + 2) % 3] = 1.0;
-
-		t1 *= p_aabb.size[(closest_axis + 1) % 3] / float(color_scan_cell_width);
-		t2 *= p_aabb.size[(closest_axis + 2) % 3] / float(color_scan_cell_width);
-
-		Color albedo_accum;
-		Color emission_accum;
-		Vector3 normal_accum;
-
-		float alpha = 0.0;
-
-		//map to a grid average in the best axis for this face
-		for (int i = 0; i < color_scan_cell_width; i++) {
-
-			Vector3 ofs_i = float(i) * t1;
-
-			for (int j = 0; j < color_scan_cell_width; j++) {
-
-				Vector3 ofs_j = float(j) * t2;
-
-				Vector3 from = p_aabb.position + ofs_i + ofs_j;
-				Vector3 to = from + t1 + t2 + axis * p_aabb.size[closest_axis];
-				Vector3 half = (to - from) * 0.5;
-
-				//is in this cell?
-				if (!fast_tri_box_overlap(from + half, half, p_vtx)) {
-					continue; //face does not span this cell
-				}
-
-				//go from -size to +size*2 to avoid skipping collisions
-				Vector3 ray_from = from + (t1 + t2) * 0.5 - axis * p_aabb.size[closest_axis];
-				Vector3 ray_to = ray_from + axis * p_aabb.size[closest_axis] * 2;
-
-				if (normal.dot(ray_from - ray_to) < 0) {
-					SWAP(ray_from, ray_to);
-				}
-
-				Vector3 intersection;
-
-				if (!plane.intersects_segment(ray_from, ray_to, &intersection)) {
-					if (ABS(plane.distance_to(ray_from)) < ABS(plane.distance_to(ray_to))) {
-						intersection = plane.project(ray_from);
-					} else {
-
-						intersection = plane.project(ray_to);
-					}
-				}
-
-				intersection = Face3(p_vtx[0], p_vtx[1], p_vtx[2]).get_closest_point_to(intersection);
-
-				Vector2 uv = get_uv(intersection, p_vtx, p_uv);
-
-				int uv_x = CLAMP(Math::fposmod(uv.x, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
-				int uv_y = CLAMP(Math::fposmod(uv.y, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
-
-				int ofs = uv_y * bake_texture_size + uv_x;
-				albedo_accum.r += p_material.albedo[ofs].r;
-				albedo_accum.g += p_material.albedo[ofs].g;
-				albedo_accum.b += p_material.albedo[ofs].b;
-				albedo_accum.a += p_material.albedo[ofs].a;
-
-				emission_accum.r += p_material.emission[ofs].r;
-				emission_accum.g += p_material.emission[ofs].g;
-				emission_accum.b += p_material.emission[ofs].b;
-
-				normal_accum += normal;
-
-				alpha += 1.0;
-			}
-		}
-
-		if (alpha == 0) {
-			//could not in any way get texture information.. so use closest point to center
-
-			Face3 f(p_vtx[0], p_vtx[1], p_vtx[2]);
-			Vector3 inters = f.get_closest_point_to(p_aabb.position + p_aabb.size * 0.5);
-
-			Vector2 uv = get_uv(inters, p_vtx, p_uv);
-
-			int uv_x = CLAMP(Math::fposmod(uv.x, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
-			int uv_y = CLAMP(Math::fposmod(uv.y, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
-
-			int ofs = uv_y * bake_texture_size + uv_x;
-
-			alpha = 1.0 / (color_scan_cell_width * color_scan_cell_width);
-
-			albedo_accum.r = p_material.albedo[ofs].r * alpha;
-			albedo_accum.g = p_material.albedo[ofs].g * alpha;
-			albedo_accum.b = p_material.albedo[ofs].b * alpha;
-			albedo_accum.a = p_material.albedo[ofs].a * alpha;
-
-			emission_accum.r = p_material.emission[ofs].r * alpha;
-			emission_accum.g = p_material.emission[ofs].g * alpha;
-			emission_accum.b = p_material.emission[ofs].b * alpha;
-
-			normal_accum *= alpha;
-
-		} else {
-
-			float accdiv = 1.0 / (color_scan_cell_width * color_scan_cell_width);
-			alpha *= accdiv;
-
-			albedo_accum.r *= accdiv;
-			albedo_accum.g *= accdiv;
-			albedo_accum.b *= accdiv;
-			albedo_accum.a *= accdiv;
-
-			emission_accum.r *= accdiv;
-			emission_accum.g *= accdiv;
-			emission_accum.b *= accdiv;
-
-			normal_accum *= accdiv;
-		}
-
-		//put this temporarily here, corrected in a later step
-		p_baker->bake_cells[p_idx].albedo[0] += albedo_accum.r;
-		p_baker->bake_cells[p_idx].albedo[1] += albedo_accum.g;
-		p_baker->bake_cells[p_idx].albedo[2] += albedo_accum.b;
-		p_baker->bake_cells[p_idx].emission[0] += emission_accum.r;
-		p_baker->bake_cells[p_idx].emission[1] += emission_accum.g;
-		p_baker->bake_cells[p_idx].emission[2] += emission_accum.b;
-		p_baker->bake_cells[p_idx].normal[0] += normal_accum.x;
-		p_baker->bake_cells[p_idx].normal[1] += normal_accum.y;
-		p_baker->bake_cells[p_idx].normal[2] += normal_accum.z;
-		p_baker->bake_cells[p_idx].alpha += alpha;
-
-	} else {
-		//go down
-
-		int half = (1 << (p_baker->cell_subdiv - 1)) >> (p_level + 1);
-		for (int i = 0; i < 8; i++) {
-
-			AABB aabb = p_aabb;
-			aabb.size *= 0.5;
-
-			int nx = p_x;
-			int ny = p_y;
-			int nz = p_z;
-
-			if (i & 1) {
-				aabb.position.x += aabb.size.x;
-				nx += half;
-			}
-			if (i & 2) {
-				aabb.position.y += aabb.size.y;
-				ny += half;
-			}
-			if (i & 4) {
-				aabb.position.z += aabb.size.z;
-				nz += half;
-			}
-			//make sure to not plot beyond limits
-			if (nx < 0 || nx >= p_baker->axis_cell_size[0] || ny < 0 || ny >= p_baker->axis_cell_size[1] || nz < 0 || nz >= p_baker->axis_cell_size[2])
-				continue;
-
-			{
-				AABB test_aabb = aabb;
-				//test_aabb.grow_by(test_aabb.get_longest_axis_size()*0.05); //grow a bit to avoid numerical error in real-time
-				Vector3 qsize = test_aabb.size * 0.5; //quarter size, for fast aabb test
-
-				if (!fast_tri_box_overlap(test_aabb.position + qsize, qsize, p_vtx)) {
-					//if (!Face3(p_vtx[0],p_vtx[1],p_vtx[2]).intersects_aabb2(aabb)) {
-					//does not fit in child, go on
-					continue;
-				}
-			}
-
-			if (p_baker->bake_cells[p_idx].childs[i] == Baker::CHILD_EMPTY) {
-				//sub cell must be created
-
-				uint32_t child_idx = p_baker->bake_cells.size();
-				p_baker->bake_cells[p_idx].childs[i] = child_idx;
-				p_baker->bake_cells.resize(p_baker->bake_cells.size() + 1);
-				p_baker->bake_cells[child_idx].level = p_level + 1;
-			}
-
-			_plot_face(p_baker->bake_cells[p_idx].childs[i], p_level + 1, nx, ny, nz, p_vtx, p_uv, p_material, aabb, p_baker);
-		}
-	}
-}
-
-void GIProbe::_fixup_plot(int p_idx, int p_level, int p_x, int p_y, int p_z, Baker *p_baker) {
-
-	if (p_level == p_baker->cell_subdiv - 1) {
-
-		p_baker->leaf_voxel_count++;
-		float alpha = p_baker->bake_cells[p_idx].alpha;
-
-		p_baker->bake_cells[p_idx].albedo[0] /= alpha;
-		p_baker->bake_cells[p_idx].albedo[1] /= alpha;
-		p_baker->bake_cells[p_idx].albedo[2] /= alpha;
-
-		//transfer emission to light
-		p_baker->bake_cells[p_idx].emission[0] /= alpha;
-		p_baker->bake_cells[p_idx].emission[1] /= alpha;
-		p_baker->bake_cells[p_idx].emission[2] /= alpha;
-
-		p_baker->bake_cells[p_idx].normal[0] /= alpha;
-		p_baker->bake_cells[p_idx].normal[1] /= alpha;
-		p_baker->bake_cells[p_idx].normal[2] /= alpha;
-
-		Vector3 n(p_baker->bake_cells[p_idx].normal[0], p_baker->bake_cells[p_idx].normal[1], p_baker->bake_cells[p_idx].normal[2]);
-		if (n.length() < 0.01) {
-			//too much fight over normal, zero it
-			p_baker->bake_cells[p_idx].normal[0] = 0;
-			p_baker->bake_cells[p_idx].normal[1] = 0;
-			p_baker->bake_cells[p_idx].normal[2] = 0;
-		} else {
-			n.normalize();
-			p_baker->bake_cells[p_idx].normal[0] = n.x;
-			p_baker->bake_cells[p_idx].normal[1] = n.y;
-			p_baker->bake_cells[p_idx].normal[2] = n.z;
-		}
-
-		p_baker->bake_cells[p_idx].alpha = 1.0;
-
-		/*
-		//remove neighbours from used sides
-
-		for(int n=0;n<6;n++) {
-
-			int ofs[3]={0,0,0};
-
-			ofs[n/2]=(n&1)?1:-1;
-
-			//convert to x,y,z on this level
-			int x=p_x;
-			int y=p_y;
-			int z=p_z;
-
-			x+=ofs[0];
-			y+=ofs[1];
-			z+=ofs[2];
-
-			int ofs_x=0;
-			int ofs_y=0;
-			int ofs_z=0;
-			int size = 1<<p_level;
-			int half=size/2;
-
-
-			if (x<0 || x>=size || y<0 || y>=size || z<0 || z>=size) {
-				//neighbour is out, can't use it
-				p_baker->bake_cells[p_idx].used_sides&=~(1<<uint32_t(n));
-				continue;
-			}
-
-			uint32_t neighbour=0;
-
-			for(int i=0;i<p_baker->cell_subdiv-1;i++) {
-
-				Baker::Cell *bc = &p_baker->bake_cells[neighbour];
-
-				int child = 0;
-				if (x >= ofs_x + half) {
-					child|=1;
-					ofs_x+=half;
-				}
-				if (y >= ofs_y + half) {
-					child|=2;
-					ofs_y+=half;
-				}
-				if (z >= ofs_z + half) {
-					child|=4;
-					ofs_z+=half;
-				}
-
-				neighbour = bc->childs[child];
-				if (neighbour==Baker::CHILD_EMPTY) {
-					break;
-				}
-
-				half>>=1;
-			}
-
-			if (neighbour!=Baker::CHILD_EMPTY) {
-				p_baker->bake_cells[p_idx].used_sides&=~(1<<uint32_t(n));
-			}
-		}
-		*/
-	} else {
-
-		//go down
-
-		float alpha_average = 0;
-		int half = (1 << (p_baker->cell_subdiv - 1)) >> (p_level + 1);
-		for (int i = 0; i < 8; i++) {
-
-			uint32_t child = p_baker->bake_cells[p_idx].childs[i];
-
-			if (child == Baker::CHILD_EMPTY)
-				continue;
-
-			int nx = p_x;
-			int ny = p_y;
-			int nz = p_z;
-
-			if (i & 1)
-				nx += half;
-			if (i & 2)
-				ny += half;
-			if (i & 4)
-				nz += half;
-
-			_fixup_plot(child, p_level + 1, nx, ny, nz, p_baker);
-			alpha_average += p_baker->bake_cells[child].alpha;
-		}
-
-		p_baker->bake_cells[p_idx].alpha = alpha_average / 8.0;
-		p_baker->bake_cells[p_idx].emission[0] = 0;
-		p_baker->bake_cells[p_idx].emission[1] = 0;
-		p_baker->bake_cells[p_idx].emission[2] = 0;
-		p_baker->bake_cells[p_idx].normal[0] = 0;
-		p_baker->bake_cells[p_idx].normal[1] = 0;
-		p_baker->bake_cells[p_idx].normal[2] = 0;
-		p_baker->bake_cells[p_idx].albedo[0] = 0;
-		p_baker->bake_cells[p_idx].albedo[1] = 0;
-		p_baker->bake_cells[p_idx].albedo[2] = 0;
-	}
-}
-
-Vector<Color> GIProbe::_get_bake_texture(Ref<Image> p_image, const Color &p_color_mul, const Color &p_color_add) {
-
-	Vector<Color> ret;
-
-	if (p_image.is_null() || p_image->empty()) {
-
-		ret.resize(bake_texture_size * bake_texture_size);
-		for (int i = 0; i < bake_texture_size * bake_texture_size; i++) {
-			ret[i] = p_color_add;
-		}
-
-		return ret;
-	}
-	p_image = p_image->duplicate();
-
-	if (p_image->is_compressed()) {
-		print_line("DECOMPRESSING!!!!");
-
-		p_image->decompress();
-	}
-	p_image->convert(Image::FORMAT_RGBA8);
-	p_image->resize(bake_texture_size, bake_texture_size, Image::INTERPOLATE_CUBIC);
-
-	PoolVector<uint8_t>::Read r = p_image->get_data().read();
-	ret.resize(bake_texture_size * bake_texture_size);
-
-	for (int i = 0; i < bake_texture_size * bake_texture_size; i++) {
-		Color c;
-		c.r = (r[i * 4 + 0] / 255.0) * p_color_mul.r + p_color_add.r;
-		c.g = (r[i * 4 + 1] / 255.0) * p_color_mul.g + p_color_add.g;
-		c.b = (r[i * 4 + 2] / 255.0) * p_color_mul.b + p_color_add.b;
-
-		c.a = r[i * 4 + 3] / 255.0;
-
-		ret[i] = c;
-	}
-
-	return ret;
-}
-
-GIProbe::Baker::MaterialCache GIProbe::_get_material_cache(Ref<Material> p_material, Baker *p_baker) {
-
-	//this way of obtaining materials is inaccurate and also does not support some compressed formats very well
-	Ref<SpatialMaterial> mat = p_material;
-
-	Ref<Material> material = mat; //hack for now
-
-	if (p_baker->material_cache.has(material)) {
-		return p_baker->material_cache[material];
-	}
-
-	Baker::MaterialCache mc;
-
-	if (mat.is_valid()) {
-
-		Ref<Texture> albedo_tex = mat->get_texture(SpatialMaterial::TEXTURE_ALBEDO);
-
-		Ref<Image> img_albedo;
-		if (albedo_tex.is_valid()) {
-
-			img_albedo = albedo_tex->get_data();
-			mc.albedo = _get_bake_texture(img_albedo, mat->get_albedo(), Color(0, 0, 0)); // albedo texture, color is multiplicative
-		} else {
-			mc.albedo = _get_bake_texture(img_albedo, Color(1, 1, 1), mat->get_albedo()); // no albedo texture, color is additive
-		}
-
-		Ref<Texture> emission_tex = mat->get_texture(SpatialMaterial::TEXTURE_EMISSION);
-
-		Color emission_col = mat->get_emission();
-		float emission_energy = mat->get_emission_energy();
-
-		Ref<Image> img_emission;
-
-		if (emission_tex.is_valid()) {
-
-			img_emission = emission_tex->get_data();
-		}
-
-		if (mat->get_emission_operator() == SpatialMaterial::EMISSION_OP_ADD) {
-			mc.emission = _get_bake_texture(img_emission, Color(1, 1, 1) * emission_energy, emission_col * emission_energy);
-		} else {
-			mc.emission = _get_bake_texture(img_emission, emission_col * emission_energy, Color(0, 0, 0));
-		}
-
-	} else {
-		Ref<Image> empty;
-
-		mc.albedo = _get_bake_texture(empty, Color(0, 0, 0), Color(1, 1, 1));
-		mc.emission = _get_bake_texture(empty, Color(0, 0, 0), Color(0, 0, 0));
-	}
-
-	p_baker->material_cache[p_material] = mc;
-	return mc;
-}
-
-void GIProbe::_plot_mesh(const Transform &p_xform, Ref<Mesh> &p_mesh, Baker *p_baker, const Vector<Ref<Material> > &p_materials, const Ref<Material> &p_override_material) {
-
-	for (int i = 0; i < p_mesh->get_surface_count(); i++) {
-
-		if (p_mesh->surface_get_primitive_type(i) != Mesh::PRIMITIVE_TRIANGLES)
-			continue; //only triangles
-
-		Ref<Material> src_material;
-
-		if (p_override_material.is_valid()) {
-			src_material = p_override_material;
-		} else if (i < p_materials.size() && p_materials[i].is_valid()) {
-			src_material = p_materials[i];
-		} else {
-			src_material = p_mesh->surface_get_material(i);
-		}
-		Baker::MaterialCache material = _get_material_cache(src_material, p_baker);
-
-		Array a = p_mesh->surface_get_arrays(i);
-
-		PoolVector<Vector3> vertices = a[Mesh::ARRAY_VERTEX];
-		PoolVector<Vector3>::Read vr = vertices.read();
-		PoolVector<Vector2> uv = a[Mesh::ARRAY_TEX_UV];
-		PoolVector<Vector2>::Read uvr;
-		PoolVector<int> index = a[Mesh::ARRAY_INDEX];
-
-		bool read_uv = false;
-
-		if (uv.size()) {
-
-			uvr = uv.read();
-			read_uv = true;
-		}
-
-		if (index.size()) {
-
-			int facecount = index.size() / 3;
-			PoolVector<int>::Read ir = index.read();
-
-			for (int j = 0; j < facecount; j++) {
-
-				Vector3 vtxs[3];
-				Vector2 uvs[3];
-
-				for (int k = 0; k < 3; k++) {
-					vtxs[k] = p_xform.xform(vr[ir[j * 3 + k]]);
-				}
-
-				if (read_uv) {
-					for (int k = 0; k < 3; k++) {
-						uvs[k] = uvr[ir[j * 3 + k]];
-					}
-				}
-
-				//test against original bounds
-				if (!fast_tri_box_overlap(-extents, extents * 2, vtxs))
-					continue;
-				//plot
-				_plot_face(0, 0, 0, 0, 0, vtxs, uvs, material, p_baker->po2_bounds, p_baker);
-			}
-
-		} else {
-
-			int facecount = vertices.size() / 3;
-
-			for (int j = 0; j < facecount; j++) {
-
-				Vector3 vtxs[3];
-				Vector2 uvs[3];
-
-				for (int k = 0; k < 3; k++) {
-					vtxs[k] = p_xform.xform(vr[j * 3 + k]);
-				}
-
-				if (read_uv) {
-					for (int k = 0; k < 3; k++) {
-						uvs[k] = uvr[j * 3 + k];
-					}
-				}
-
-				//test against original bounds
-				if (!fast_tri_box_overlap(-extents, extents * 2, vtxs))
-					continue;
-				//plot face
-				_plot_face(0, 0, 0, 0, 0, vtxs, uvs, material, p_baker->po2_bounds, p_baker);
-			}
-		}
-	}
-}
-
-void GIProbe::_find_meshes(Node *p_at_node, Baker *p_baker) {
+void GIProbe::_find_meshes(Node *p_at_node, List<PlotMesh> &plot_meshes) {
 
 	MeshInstance *mi = Object::cast_to<MeshInstance>(p_at_node);
 	if (mi && mi->get_flag(GeometryInstance::FLAG_USE_BAKED_LIGHT) && mi->is_visible_in_tree()) {
@@ -1088,14 +342,14 @@ void GIProbe::_find_meshes(Node *p_at_node, Baker *p_baker) {
 			Transform xf = get_global_transform().affine_inverse() * mi->get_global_transform();
 
 			if (AABB(-extents, extents * 2).intersects(xf.xform(aabb))) {
-				Baker::PlotMesh pm;
+				PlotMesh pm;
 				pm.local_xform = xf;
 				pm.mesh = mesh;
 				for (int i = 0; i < mesh->get_surface_count(); i++) {
 					pm.instance_materials.push_back(mi->get_surface_material(i));
 				}
 				pm.override_material = mi->get_material_override();
-				p_baker->mesh_list.push_back(pm);
+				plot_meshes.push_back(pm);
 			}
 		}
 	}
@@ -1118,10 +372,10 @@ void GIProbe::_find_meshes(Node *p_at_node, Baker *p_baker) {
 				Transform xf = get_global_transform().affine_inverse() * (s->get_global_transform() * mxf);
 
 				if (AABB(-extents, extents * 2).intersects(xf.xform(aabb))) {
-					Baker::PlotMesh pm;
+					PlotMesh pm;
 					pm.local_xform = xf;
 					pm.mesh = mesh;
-					p_baker->mesh_list.push_back(pm);
+					plot_meshes.push_back(pm);
 				}
 			}
 		}
@@ -1133,7 +387,7 @@ void GIProbe::_find_meshes(Node *p_at_node, Baker *p_baker) {
 		if (!child->get_owner())
 			continue; //maybe a helper
 
-		_find_meshes(child, p_baker);
+		_find_meshes(child, plot_meshes);
 	}
 }
 
@@ -1143,145 +397,56 @@ GIProbe::BakeEndFunc GIProbe::bake_end_function = NULL;
 
 void GIProbe::bake(Node *p_from_node, bool p_create_visual_debug) {
 
-	Baker baker;
-
 	static const int subdiv_value[SUBDIV_MAX] = { 7, 8, 9, 10 };
 
-	baker.cell_subdiv = subdiv_value[subdiv];
-	baker.bake_cells.resize(1);
-
-	//find out the actual real bounds, power of 2, which gets the highest subdivision
-	baker.po2_bounds = AABB(-extents, extents * 2.0);
-	int longest_axis = baker.po2_bounds.get_longest_axis_index();
-	baker.axis_cell_size[longest_axis] = (1 << (baker.cell_subdiv - 1));
-	baker.leaf_voxel_count = 0;
-
-	for (int i = 0; i < 3; i++) {
-
-		if (i == longest_axis)
-			continue;
+	VoxelLightBaker baker;
 
-		baker.axis_cell_size[i] = baker.axis_cell_size[longest_axis];
-		float axis_size = baker.po2_bounds.size[longest_axis];
+	baker.begin_bake(subdiv_value[subdiv], AABB(-extents, extents * 2.0));
 
-		//shrink until fit subdiv
-		while (axis_size / 2.0 >= baker.po2_bounds.size[i]) {
-			axis_size /= 2.0;
-			baker.axis_cell_size[i] >>= 1;
-		}
-
-		baker.po2_bounds.size[i] = baker.po2_bounds.size[longest_axis];
-	}
-
-	Transform to_bounds;
-	to_bounds.basis.scale(Vector3(baker.po2_bounds.size[longest_axis], baker.po2_bounds.size[longest_axis], baker.po2_bounds.size[longest_axis]));
-	to_bounds.origin = baker.po2_bounds.position;
-
-	Transform to_grid;
-	to_grid.basis.scale(Vector3(baker.axis_cell_size[longest_axis], baker.axis_cell_size[longest_axis], baker.axis_cell_size[longest_axis]));
+	List<PlotMesh> mesh_list;
 
-	baker.to_cell_space = to_grid * to_bounds.affine_inverse();
-
-	_find_meshes(p_from_node ? p_from_node : get_parent(), &baker);
+	_find_meshes(p_from_node ? p_from_node : get_parent(), mesh_list);
 
 	if (bake_begin_function) {
-		bake_begin_function(baker.mesh_list.size() + 1);
+		bake_begin_function(mesh_list.size() + 1);
 	}
 
 	int pmc = 0;
 
-	for (List<Baker::PlotMesh>::Element *E = baker.mesh_list.front(); E; E = E->next()) {
+	for (List<PlotMesh>::Element *E = mesh_list.front(); E; E = E->next()) {
 
 		if (bake_step_function) {
-			bake_step_function(pmc, RTR("Plotting Meshes") + " " + itos(pmc) + "/" + itos(baker.mesh_list.size()));
+			bake_step_function(pmc, RTR("Plotting Meshes") + " " + itos(pmc) + "/" + itos(mesh_list.size()));
 		}
 
 		pmc++;
 
-		_plot_mesh(E->get().local_xform, E->get().mesh, &baker, E->get().instance_materials, E->get().override_material);
+		baker.plot_mesh(E->get().local_xform, E->get().mesh, E->get().instance_materials, E->get().override_material);
 	}
 	if (bake_step_function) {
 		bake_step_function(pmc++, RTR("Finishing Plot"));
 	}
 
-	_fixup_plot(0, 0, 0, 0, 0, &baker);
+	baker.end_bake();
 
 	//create the data for visual server
 
-	PoolVector<int> data;
-
-	data.resize(16 + (8 + 1 + 1 + 1 + 1) * baker.bake_cells.size()); //4 for header, rest for rest.
-
-	{
-		PoolVector<int>::Write w = data.write();
-
-		uint32_t *w32 = (uint32_t *)w.ptr();
+	PoolVector<int> data = baker.create_gi_probe_data();
 
-		w32[0] = 0; //version
-		w32[1] = baker.cell_subdiv; //subdiv
-		w32[2] = baker.axis_cell_size[0];
-		w32[3] = baker.axis_cell_size[1];
-		w32[4] = baker.axis_cell_size[2];
-		w32[5] = baker.bake_cells.size();
-		w32[6] = baker.leaf_voxel_count;
-
-		int ofs = 16;
-
-		for (int i = 0; i < baker.bake_cells.size(); i++) {
-
-			for (int j = 0; j < 8; j++) {
-				w32[ofs++] = baker.bake_cells[i].childs[j];
-			}
-
-			{ //albedo
-				uint32_t rgba = uint32_t(CLAMP(baker.bake_cells[i].albedo[0] * 255.0, 0, 255)) << 16;
-				rgba |= uint32_t(CLAMP(baker.bake_cells[i].albedo[1] * 255.0, 0, 255)) << 8;
-				rgba |= uint32_t(CLAMP(baker.bake_cells[i].albedo[2] * 255.0, 0, 255)) << 0;
-
-				w32[ofs++] = rgba;
-			}
-			{ //emission
-
-				Vector3 e(baker.bake_cells[i].emission[0], baker.bake_cells[i].emission[1], baker.bake_cells[i].emission[2]);
-				float l = e.length();
-				if (l > 0) {
-					e.normalize();
-					l = CLAMP(l / 8.0, 0, 1.0);
-				}
-
-				uint32_t em = uint32_t(CLAMP(e[0] * 255, 0, 255)) << 24;
-				em |= uint32_t(CLAMP(e[1] * 255, 0, 255)) << 16;
-				em |= uint32_t(CLAMP(e[2] * 255, 0, 255)) << 8;
-				em |= uint32_t(CLAMP(l * 255, 0, 255));
-
-				w32[ofs++] = em;
-			}
-
-			//w32[ofs++]=baker.bake_cells[i].used_sides;
-			{ //normal
-
-				Vector3 n(baker.bake_cells[i].normal[0], baker.bake_cells[i].normal[1], baker.bake_cells[i].normal[2]);
-				n = n * Vector3(0.5, 0.5, 0.5) + Vector3(0.5, 0.5, 0.5);
-				uint32_t norm = 0;
-
-				norm |= uint32_t(CLAMP(n.x * 255.0, 0, 255)) << 16;
-				norm |= uint32_t(CLAMP(n.y * 255.0, 0, 255)) << 8;
-				norm |= uint32_t(CLAMP(n.z * 255.0, 0, 255)) << 0;
-
-				w32[ofs++] = norm;
-			}
-
-			{
-				uint16_t alpha = CLAMP(uint32_t(baker.bake_cells[i].alpha * 65535.0), 0, 65535);
-				uint16_t level = baker.bake_cells[i].level;
-
-				w32[ofs++] = (uint32_t(level) << 16) | uint32_t(alpha);
-			}
+	if (p_create_visual_debug) {
+		MultiMeshInstance *mmi = memnew(MultiMeshInstance);
+		mmi->set_multimesh(baker.create_debug_multimesh());
+		add_child(mmi);
+#ifdef TOOLS_ENABLED
+		if (get_tree()->get_edited_scene_root() == this) {
+			mmi->set_owner(this);
+		} else {
+			mmi->set_owner(get_owner());
 		}
-	}
+#else
+		mmi->set_owner(get_owner());
+#endif
 
-	if (p_create_visual_debug) {
-		_create_debug_mesh(&baker);
 	} else {
 
 		Ref<GIProbeData> probe_data = get_probe_data();
@@ -1290,7 +455,7 @@ void GIProbe::bake(Node *p_from_node, bool p_create_visual_debug) {
 			probe_data.instance();
 
 		probe_data->set_bounds(AABB(-extents, extents * 2.0));
-		probe_data->set_cell_size(baker.po2_bounds.size[longest_axis] / baker.axis_cell_size[longest_axis]);
+		probe_data->set_cell_size(baker.get_cell_size());
 		probe_data->set_dynamic_data(data);
 		probe_data->set_dynamic_range(dynamic_range);
 		probe_data->set_energy(energy);
@@ -1299,7 +464,7 @@ void GIProbe::bake(Node *p_from_node, bool p_create_visual_debug) {
 		probe_data->set_propagation(propagation);
 		probe_data->set_interior(interior);
 		probe_data->set_compress(compress);
-		probe_data->set_to_cell_xform(baker.to_cell_space);
+		probe_data->set_to_cell_xform(baker.get_to_cell_space_xform());
 
 		set_probe_data(probe_data);
 	}
@@ -1309,135 +474,6 @@ void GIProbe::bake(Node *p_from_node, bool p_create_visual_debug) {
 	}
 }
 
-void GIProbe::_debug_mesh(int p_idx, int p_level, const AABB &p_aabb, Ref<MultiMesh> &p_multimesh, int &idx, Baker *p_baker) {
-
-	if (p_level == p_baker->cell_subdiv - 1) {
-
-		Vector3 center = p_aabb.position + p_aabb.size * 0.5;
-		Transform xform;
-		xform.origin = center;
-		xform.basis.scale(p_aabb.size * 0.5);
-		p_multimesh->set_instance_transform(idx, xform);
-		Color col = Color(p_baker->bake_cells[p_idx].albedo[0], p_baker->bake_cells[p_idx].albedo[1], p_baker->bake_cells[p_idx].albedo[2]);
-		//Color col = Color(p_baker->bake_cells[p_idx].emission[0], p_baker->bake_cells[p_idx].emission[1], p_baker->bake_cells[p_idx].emission[2]);
-		p_multimesh->set_instance_color(idx, col);
-
-		idx++;
-
-	} else {
-
-		for (int i = 0; i < 8; i++) {
-
-			if (p_baker->bake_cells[p_idx].childs[i] == Baker::CHILD_EMPTY)
-				continue;
-
-			AABB aabb = p_aabb;
-			aabb.size *= 0.5;
-
-			if (i & 1)
-				aabb.position.x += aabb.size.x;
-			if (i & 2)
-				aabb.position.y += aabb.size.y;
-			if (i & 4)
-				aabb.position.z += aabb.size.z;
-
-			_debug_mesh(p_baker->bake_cells[p_idx].childs[i], p_level + 1, aabb, p_multimesh, idx, p_baker);
-		}
-	}
-}
-
-void GIProbe::_create_debug_mesh(Baker *p_baker) {
-
-	Ref<MultiMesh> mm;
-	mm.instance();
-
-	mm->set_transform_format(MultiMesh::TRANSFORM_3D);
-	mm->set_color_format(MultiMesh::COLOR_8BIT);
-	print_line("leaf voxels: " + itos(p_baker->leaf_voxel_count));
-	mm->set_instance_count(p_baker->leaf_voxel_count);
-
-	Ref<ArrayMesh> mesh;
-	mesh.instance();
-
-	{
-		Array arr;
-		arr.resize(Mesh::ARRAY_MAX);
-
-		PoolVector<Vector3> vertices;
-		PoolVector<Color> colors;
-
-		int vtx_idx = 0;
-#define ADD_VTX(m_idx)                      \
-	;                                       \
-	vertices.push_back(face_points[m_idx]); \
-	colors.push_back(Color(1, 1, 1, 1));    \
-	vtx_idx++;
-
-		for (int i = 0; i < 6; i++) {
-
-			Vector3 face_points[4];
-
-			for (int j = 0; j < 4; j++) {
-
-				float v[3];
-				v[0] = 1.0;
-				v[1] = 1 - 2 * ((j >> 1) & 1);
-				v[2] = v[1] * (1 - 2 * (j & 1));
-
-				for (int k = 0; k < 3; k++) {
-
-					if (i < 3)
-						face_points[j][(i + k) % 3] = v[k] * (i >= 3 ? -1 : 1);
-					else
-						face_points[3 - j][(i + k) % 3] = v[k] * (i >= 3 ? -1 : 1);
-				}
-			}
-
-			//tri 1
-			ADD_VTX(0);
-			ADD_VTX(1);
-			ADD_VTX(2);
-			//tri 2
-			ADD_VTX(2);
-			ADD_VTX(3);
-			ADD_VTX(0);
-		}
-
-		arr[Mesh::ARRAY_VERTEX] = vertices;
-		arr[Mesh::ARRAY_COLOR] = colors;
-		mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, arr);
-	}
-
-	{
-		Ref<SpatialMaterial> fsm;
-		fsm.instance();
-		fsm->set_flag(SpatialMaterial::FLAG_SRGB_VERTEX_COLOR, true);
-		fsm->set_flag(SpatialMaterial::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
-		fsm->set_flag(SpatialMaterial::FLAG_UNSHADED, true);
-		fsm->set_albedo(Color(1, 1, 1, 1));
-
-		mesh->surface_set_material(0, fsm);
-	}
-
-	mm->set_mesh(mesh);
-
-	int idx = 0;
-	_debug_mesh(0, 0, p_baker->po2_bounds, mm, idx, p_baker);
-
-	MultiMeshInstance *mmi = memnew(MultiMeshInstance);
-	mmi->set_multimesh(mm);
-	add_child(mmi);
-#ifdef TOOLS_ENABLED
-	if (get_tree()->get_edited_scene_root() == this) {
-		mmi->set_owner(this);
-	} else {
-		mmi->set_owner(get_owner());
-	}
-#else
-	mmi->set_owner(get_owner());
-#endif
-}
-
 void GIProbe::_debug_bake() {
 
 	bake(NULL, true);
@@ -1516,8 +552,6 @@ GIProbe::GIProbe() {
 	normal_bias = 0.0;
 	propagation = 0.7;
 	extents = Vector3(10, 10, 10);
-	color_scan_cell_width = 4;
-	bake_texture_size = 128;
 	interior = false;
 	compress = false;
 
diff --git a/scene/3d/gi_probe.h b/scene/3d/gi_probe.h
index 324ff8e917..0858af0001 100644
--- a/scene/3d/gi_probe.h
+++ b/scene/3d/gi_probe.h
@@ -100,67 +100,6 @@ public:
 	typedef void (*BakeEndFunc)();
 
 private:
-	//stuff used for bake
-	struct Baker {
-
-		enum {
-			CHILD_EMPTY = 0xFFFFFFFF
-		};
-		struct Cell {
-
-			uint32_t childs[8];
-			float albedo[3]; //albedo in RGB24
-			float emission[3]; //accumulated light in 16:16 fixed point (needs to be integer for moving lights fast)
-			float normal[3];
-			uint32_t used_sides;
-			float alpha; //used for upsampling
-			int level;
-
-			Cell() {
-				for (int i = 0; i < 8; i++) {
-					childs[i] = CHILD_EMPTY;
-				}
-
-				for (int i = 0; i < 3; i++) {
-					emission[i] = 0;
-					albedo[i] = 0;
-					normal[i] = 0;
-				}
-				alpha = 0;
-				used_sides = 0;
-				level = 0;
-			}
-		};
-
-		Vector<Cell> bake_cells;
-		int cell_subdiv;
-
-		struct MaterialCache {
-			//128x128 textures
-			Vector<Color> albedo;
-			Vector<Color> emission;
-		};
-
-		Vector<Color> _get_bake_texture(Ref<Image> p_image, const Color &p_color);
-		Map<Ref<Material>, MaterialCache> material_cache;
-		MaterialCache _get_material_cache(Ref<Material> p_material);
-		int leaf_voxel_count;
-
-		AABB po2_bounds;
-		int axis_cell_size[3];
-
-		struct PlotMesh {
-			Ref<Material> override_material;
-			Vector<Ref<Material> > instance_materials;
-			Ref<Mesh> mesh;
-			Transform local_xform;
-		};
-
-		Transform to_cell_space;
-
-		List<PlotMesh> mesh_list;
-	};
-
 	Ref<GIProbeData> probe_data;
 
 	RID gi_probe;
@@ -175,19 +114,14 @@ private:
 	bool interior;
 	bool compress;
 
-	int color_scan_cell_width;
-	int bake_texture_size;
-
-	Vector<Color> _get_bake_texture(Ref<Image> p_image, const Color &p_color_mul, const Color &p_color_add);
-	Baker::MaterialCache _get_material_cache(Ref<Material> p_material, Baker *p_baker);
-	void _plot_face(int p_idx, int p_level, int p_x, int p_y, int p_z, const Vector3 *p_vtx, const Vector2 *p_uv, const Baker::MaterialCache &p_material, const AABB &p_aabb, Baker *p_baker);
-	void _plot_mesh(const Transform &p_xform, Ref<Mesh> &p_mesh, Baker *p_baker, const Vector<Ref<Material> > &p_materials, const Ref<Material> &p_override_material);
-	void _find_meshes(Node *p_at_node, Baker *p_baker);
-	void _fixup_plot(int p_idx, int p_level, int p_x, int p_y, int p_z, Baker *p_baker);
-
-	void _debug_mesh(int p_idx, int p_level, const AABB &p_aabb, Ref<MultiMesh> &p_multimesh, int &idx, Baker *p_baker);
-	void _create_debug_mesh(Baker *p_baker);
+	struct PlotMesh { // one mesh collected by _find_meshes() to be plotted during bake()
+		Ref<Material> override_material; // filled from MeshInstance::get_material_override() in _find_meshes()
+		Vector<Ref<Material> > instance_materials; // per-surface materials, pushed from MeshInstance::get_surface_material(i)
+		Ref<Mesh> mesh; // the mesh resource to plot
+		Transform local_xform; // probe global transform (affine inverse) * mesh global transform
+	};
 
+	void _find_meshes(Node *p_at_node, List<PlotMesh> &plot_meshes);
 	void _debug_bake();
 
 protected:
diff --git a/scene/3d/light.cpp b/scene/3d/light.cpp
index 1fc4e932e8..6eb2028d8e 100644
--- a/scene/3d/light.cpp
+++ b/scene/3d/light.cpp
@@ -142,6 +142,14 @@ PoolVector<Face3> Light::get_faces(uint32_t p_usage_flags) const {
 	return PoolVector<Face3>();
 }
 
+void Light::set_bake_mode(BakeMode p_mode) { // plain setter: only records the mode in bake_mode
+	bake_mode = p_mode;
+}
+
+Light::BakeMode Light::get_bake_mode() const { // returns the mode last stored by set_bake_mode() (or the constructor default)
+	return bake_mode;
+}
+
 void Light::_update_visibility() {
 
 	if (!is_inside_tree())
@@ -219,12 +227,16 @@ void Light::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_shadow_color", "shadow_color"), &Light::set_shadow_color);
 	ClassDB::bind_method(D_METHOD("get_shadow_color"), &Light::get_shadow_color);
 
+	ClassDB::bind_method(D_METHOD("set_bake_mode", "bake_mode"), &Light::set_bake_mode);
+	ClassDB::bind_method(D_METHOD("get_bake_mode"), &Light::get_bake_mode);
+
 	ADD_GROUP("Light", "light_");
 	ADD_PROPERTY(PropertyInfo(Variant::COLOR, "light_color", PROPERTY_HINT_COLOR_NO_ALPHA), "set_color", "get_color");
 	ADD_PROPERTYI(PropertyInfo(Variant::REAL, "light_energy", PROPERTY_HINT_RANGE, "0,16,0.01"), "set_param", "get_param", PARAM_ENERGY);
 	ADD_PROPERTYI(PropertyInfo(Variant::REAL, "light_indirect_energy", PROPERTY_HINT_RANGE, "0,16,0.01"), "set_param", "get_param", PARAM_INDIRECT_ENERGY);
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "light_negative"), "set_negative", "is_negative");
 	ADD_PROPERTYI(PropertyInfo(Variant::REAL, "light_specular", PROPERTY_HINT_RANGE, "0,1,0.01"), "set_param", "get_param", PARAM_SPECULAR);
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "light_bake_mode", PROPERTY_HINT_ENUM, "Disable,Indirect,All"), "set_bake_mode", "get_bake_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "light_cull_mask", PROPERTY_HINT_LAYERS_3D_RENDER), "set_cull_mask", "get_cull_mask");
 	ADD_GROUP("Shadow", "shadow_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "shadow_enabled"), "set_shadow", "has_shadow");
@@ -252,6 +264,10 @@ void Light::_bind_methods() {
 	BIND_ENUM_CONSTANT(PARAM_SHADOW_BIAS);
 	BIND_ENUM_CONSTANT(PARAM_SHADOW_BIAS_SPLIT_SCALE);
 	BIND_ENUM_CONSTANT(PARAM_MAX);
+
+	BIND_ENUM_CONSTANT(BAKE_DISABLED);
+	BIND_ENUM_CONSTANT(BAKE_INDIRECT);
+	BIND_ENUM_CONSTANT(BAKE_ALL);
 }
 
 Light::Light(VisualServer::LightType p_type) {
@@ -267,6 +283,7 @@ Light::Light(VisualServer::LightType p_type) {
 	VS::get_singleton()->instance_set_base(get_instance(), light);
 
 	reverse_cull = false;
+	bake_mode = BAKE_INDIRECT;
 
 	editor_only = false;
 	set_color(Color(1, 1, 1, 1));
diff --git a/scene/3d/light.h b/scene/3d/light.h
index 33e62214b1..7ba25731d9 100644
--- a/scene/3d/light.h
+++ b/scene/3d/light.h
@@ -63,6 +63,12 @@ public:
 		PARAM_MAX = VS::LIGHT_PARAM_MAX
 	};
 
+	enum BakeMode {
+		BAKE_DISABLED,
+		BAKE_INDIRECT,
+		BAKE_ALL
+	};
+
 private:
 	Color color;
 	float param[PARAM_MAX];
@@ -74,6 +80,7 @@ private:
 	VS::LightType type;
 	bool editor_only;
 	void _update_visibility();
+	BakeMode bake_mode;
 
 	// bind helpers
 
@@ -114,6 +121,9 @@ public:
 	void set_shadow_reverse_cull_face(bool p_enable);
 	bool get_shadow_reverse_cull_face() const;
 
+	void set_bake_mode(BakeMode p_mode);
+	BakeMode get_bake_mode() const;
+
 	virtual AABB get_aabb() const;
 	virtual PoolVector<Face3> get_faces(uint32_t p_usage_flags) const;
 
@@ -122,6 +132,7 @@ public:
 };
 
 VARIANT_ENUM_CAST(Light::Param);
+VARIANT_ENUM_CAST(Light::BakeMode);
 
 class DirectionalLight : public Light {
 
diff --git a/scene/3d/navigation.cpp b/scene/3d/navigation.cpp
index b6507aedb3..78cf75e3b3 100644
--- a/scene/3d/navigation.cpp
+++ b/scene/3d/navigation.cpp
@@ -202,7 +202,7 @@ void Navigation::_navmesh_unlink(int p_id) {
 	nm.linked = false;
 }
 
-int Navigation::navmesh_create(const Ref<NavigationMesh> &p_mesh, const Transform &p_xform, Object *p_owner) {
+int Navigation::navmesh_add(const Ref<NavigationMesh> &p_mesh, const Transform &p_xform, Object *p_owner) {
 
 	int id = last_id++;
 	NavMesh nm;
@@ -686,7 +686,7 @@ Vector3 Navigation::get_up_vector() const {
 
 void Navigation::_bind_methods() {
 
-	ClassDB::bind_method(D_METHOD("navmesh_create", "mesh", "xform", "owner"), &Navigation::navmesh_create, DEFVAL(Variant()));
+	ClassDB::bind_method(D_METHOD("navmesh_add", "mesh", "xform", "owner"), &Navigation::navmesh_add, DEFVAL(Variant()));
 	ClassDB::bind_method(D_METHOD("navmesh_set_transform", "id", "xform"), &Navigation::navmesh_set_transform);
 	ClassDB::bind_method(D_METHOD("navmesh_remove", "id"), &Navigation::navmesh_remove);
 
diff --git a/scene/3d/navigation.h b/scene/3d/navigation.h
index d9a38f7b00..134afa2278 100644
--- a/scene/3d/navigation.h
+++ b/scene/3d/navigation.h
@@ -166,7 +166,7 @@ public:
 	Vector3 get_up_vector() const;
 
 	//API should be as dynamic as possible
-	int navmesh_create(const Ref<NavigationMesh> &p_mesh, const Transform &p_xform, Object *p_owner = NULL);
+	int navmesh_add(const Ref<NavigationMesh> &p_mesh, const Transform &p_xform, Object *p_owner = NULL);
 	void navmesh_set_transform(int p_id, const Transform &p_xform);
 	void navmesh_remove(int p_id);
 
diff --git a/scene/3d/navigation_mesh.cpp b/scene/3d/navigation_mesh.cpp
index 40750cdfe8..4fb12b8fac 100644
--- a/scene/3d/navigation_mesh.cpp
+++ b/scene/3d/navigation_mesh.cpp
@@ -471,7 +471,7 @@ void NavigationMeshInstance::set_enabled(bool p_enabled) {
 
 			if (navmesh.is_valid()) {
 
-				nav_id = navigation->navmesh_create(navmesh, get_relative_transform(navigation), this);
+				nav_id = navigation->navmesh_add(navmesh, get_relative_transform(navigation), this);
 			}
 		}
 	}
@@ -508,7 +508,7 @@ void NavigationMeshInstance::_notification(int p_what) {
 
 					if (enabled && navmesh.is_valid()) {
 
-						nav_id = navigation->navmesh_create(navmesh, get_relative_transform(navigation), this);
+						nav_id = navigation->navmesh_add(navmesh, get_relative_transform(navigation), this);
 					}
 					break;
 				}
@@ -568,7 +568,7 @@ void NavigationMeshInstance::set_navigation_mesh(const Ref<NavigationMesh> &p_na
 	navmesh = p_navmesh;
 
 	if (navigation && navmesh.is_valid() && enabled) {
-		nav_id = navigation->navmesh_create(navmesh, get_relative_transform(navigation), this);
+		nav_id = navigation->navmesh_add(navmesh, get_relative_transform(navigation), this);
 	}
 
 	if (debug_view && navmesh.is_valid()) {
diff --git a/scene/3d/voxel_light_baker.cpp b/scene/3d/voxel_light_baker.cpp
new file mode 100644
index 0000000000..98dc1590d8
--- /dev/null
+++ b/scene/3d/voxel_light_baker.cpp
@@ -0,0 +1,2373 @@
+#include "voxel_light_baker.h"
+#include "os/os.h"
+#define FINDMINMAX(x0, x1, x2, min, max) \
+	min = max = x0;                      \
+	if (x1 < min) min = x1;              \
+	if (x1 > max) max = x1;              \
+	if (x2 < min) min = x2;              \
+	if (x2 > max) max = x2;
+
+static bool planeBoxOverlap(Vector3 normal, float d, Vector3 maxbox) { // true when the plane normal.dot(p) + d = 0 crosses the origin-centred box with half-extents maxbox
+	int q;
+	Vector3 vmin, vmax;
+	for (q = 0; q <= 2; q++) { // per axis, pick the box corner lowest (vmin) and highest (vmax) along the normal
+		if (normal[q] > 0.0f) {
+			vmin[q] = -maxbox[q];
+			vmax[q] = maxbox[q];
+		} else {
+			vmin[q] = maxbox[q];
+			vmax[q] = -maxbox[q];
+		}
+	}
+	if (normal.dot(vmin) + d > 0.0f) return false; // even the lowest corner is on the positive side: no overlap
+	if (normal.dot(vmax) + d >= 0.0f) return true; // corners lie on both sides of (or touch) the plane
+
+	return false; // the whole box is on the negative side
+}
+
+/*======================== X-tests ========================*/
+#define AXISTEST_X01(a, b, fa, fb)                 \
+	p0 = a * v0.y - b * v0.z;                      \
+	p2 = a * v2.y - b * v2.z;                      \
+	if (p0 < p2) {                                 \
+		min = p0;                                  \
+		max = p2;                                  \
+	} else {                                       \
+		min = p2;                                  \
+		max = p0;                                  \
+	}                                              \
+	rad = fa * boxhalfsize.y + fb * boxhalfsize.z; \
+	if (min > rad || max < -rad) return false;
+
+#define AXISTEST_X2(a, b, fa, fb)                  \
+	p0 = a * v0.y - b * v0.z;                      \
+	p1 = a * v1.y - b * v1.z;                      \
+	if (p0 < p1) {                                 \
+		min = p0;                                  \
+		max = p1;                                  \
+	} else {                                       \
+		min = p1;                                  \
+		max = p0;                                  \
+	}                                              \
+	rad = fa * boxhalfsize.y + fb * boxhalfsize.z; \
+	if (min > rad || max < -rad) return false;
+
+/*======================== Y-tests ========================*/
+#define AXISTEST_Y02(a, b, fa, fb)                 \
+	p0 = -a * v0.x + b * v0.z;                     \
+	p2 = -a * v2.x + b * v2.z;                     \
+	if (p0 < p2) {                                 \
+		min = p0;                                  \
+		max = p2;                                  \
+	} else {                                       \
+		min = p2;                                  \
+		max = p0;                                  \
+	}                                              \
+	rad = fa * boxhalfsize.x + fb * boxhalfsize.z; \
+	if (min > rad || max < -rad) return false;
+
+#define AXISTEST_Y1(a, b, fa, fb)                  \
+	p0 = -a * v0.x + b * v0.z;                     \
+	p1 = -a * v1.x + b * v1.z;                     \
+	if (p0 < p1) {                                 \
+		min = p0;                                  \
+		max = p1;                                  \
+	} else {                                       \
+		min = p1;                                  \
+		max = p0;                                  \
+	}                                              \
+	rad = fa * boxhalfsize.x + fb * boxhalfsize.z; \
+	if (min > rad || max < -rad) return false;
+
+	/*======================== Z-tests ========================*/
+
+#define AXISTEST_Z12(a, b, fa, fb)                 \
+	p1 = a * v1.x - b * v1.y;                      \
+	p2 = a * v2.x - b * v2.y;                      \
+	if (p2 < p1) {                                 \
+		min = p2;                                  \
+		max = p1;                                  \
+	} else {                                       \
+		min = p1;                                  \
+		max = p2;                                  \
+	}                                              \
+	rad = fa * boxhalfsize.x + fb * boxhalfsize.y; \
+	if (min > rad || max < -rad) return false;
+
+#define AXISTEST_Z0(a, b, fa, fb)                  \
+	p0 = a * v0.x - b * v0.y;                      \
+	p1 = a * v1.x - b * v1.y;                      \
+	if (p0 < p1) {                                 \
+		min = p0;                                  \
+		max = p1;                                  \
+	} else {                                       \
+		min = p1;                                  \
+		max = p0;                                  \
+	}                                              \
+	rad = fa * boxhalfsize.x + fb * boxhalfsize.y; \
+	if (min > rad || max < -rad) return false;
+
+static bool fast_tri_box_overlap(const Vector3 &boxcenter, const Vector3 boxhalfsize, const Vector3 *triverts) { // true if triangle triverts[0..2] overlaps the axis-aligned box centred at boxcenter with half-extents boxhalfsize
+
+	/*    use separating axis theorem to test overlap between triangle and box */
+	/*    need to test for overlap in these directions: */
+	/*    1) the {x,y,z}-directions (actually, since we use the AABB of the triangle */
+	/*       we do not even need to test these) */
+	/*    2) normal of the triangle */
+	/*    3) crossproduct(edge from tri, {x,y,z}-direction) */
+	/*       this gives 3x3=9 more tests */
+	Vector3 v0, v1, v2;
+	float min, max, d, p0, p1, p2, rad, fex, fey, fez; // min, max, p0..p2 and rad are scratch values written by the AXISTEST_ and FINDMINMAX macros
+	Vector3 normal, e0, e1, e2;
+
+	/* This is the fastest branch on Sun */
+	/* move everything so that the boxcenter is in (0,0,0) */
+
+	v0 = triverts[0] - boxcenter;
+	v1 = triverts[1] - boxcenter;
+	v2 = triverts[2] - boxcenter;
+
+	/* compute triangle edges */
+	e0 = v1 - v0; /* tri edge 0 */
+	e1 = v2 - v1; /* tri edge 1 */
+	e2 = v0 - v2; /* tri edge 2 */
+
+	/* Bullet 3:  */
+	/*  test the 9 tests first (this was faster) */
+	fex = Math::abs(e0.x);
+	fey = Math::abs(e0.y);
+	fez = Math::abs(e0.z);
+	AXISTEST_X01(e0.z, e0.y, fez, fey); // each AXISTEST_ macro contains a "return false": it exits this function as soon as its axis separates triangle and box
+	AXISTEST_Y02(e0.z, e0.x, fez, fex);
+	AXISTEST_Z12(e0.y, e0.x, fey, fex);
+
+	fex = Math::abs(e1.x);
+	fey = Math::abs(e1.y);
+	fez = Math::abs(e1.z);
+	AXISTEST_X01(e1.z, e1.y, fez, fey);
+	AXISTEST_Y02(e1.z, e1.x, fez, fex);
+	AXISTEST_Z0(e1.y, e1.x, fey, fex);
+
+	fex = Math::abs(e2.x);
+	fey = Math::abs(e2.y);
+	fez = Math::abs(e2.z);
+	AXISTEST_X2(e2.z, e2.y, fez, fey);
+	AXISTEST_Y1(e2.z, e2.x, fez, fex);
+	AXISTEST_Z12(e2.y, e2.x, fey, fex);
+
+	/* Bullet 1: */
+	/*  first test overlap in the {x,y,z}-directions */
+	/*  find min, max of the triangle each direction, and test for overlap in */
+	/*  that direction -- this is equivalent to testing a minimal AABB around */
+	/*  the triangle against the AABB */
+
+	/* test in X-direction */
+	FINDMINMAX(v0.x, v1.x, v2.x, min, max);
+	if (min > boxhalfsize.x || max < -boxhalfsize.x) return false;
+
+	/* test in Y-direction */
+	FINDMINMAX(v0.y, v1.y, v2.y, min, max);
+	if (min > boxhalfsize.y || max < -boxhalfsize.y) return false;
+
+	/* test in Z-direction */
+	FINDMINMAX(v0.z, v1.z, v2.z, min, max);
+	if (min > boxhalfsize.z || max < -boxhalfsize.z) return false;
+
+	/* Bullet 2: */
+	/*  test if the box intersects the plane of the triangle */
+	/*  compute plane equation of triangle: normal*x+d=0 */
+	normal = e0.cross(e1);
+	d = -normal.dot(v0); /* plane eq: normal.x+d=0 */
+	if (!planeBoxOverlap(normal, d, boxhalfsize)) return false;
+
+	return true; /* box and triangle overlaps */
+}
+
+// Interpolates the UV at p_pos from the triangle p_vtx[0..2] with per-vertex UVs p_uv[0..2],
+// using barycentric weights computed relative to p_vtx[0].
+// If p_pos coincides (within CMP_EPSILON2, squared distance) with a vertex, that vertex's UV is
+// returned directly; if the barycentric denominator is exactly zero, p_uv[0] is returned.
+static _FORCE_INLINE_ Vector2 get_uv(const Vector3 &p_pos, const Vector3 *p_vtx, const Vector2 *p_uv) {
+
+	//snap to a vertex when the point sits right on top of it
+	for (int corner = 0; corner < 3; corner++) {
+		if (p_pos.distance_squared_to(p_vtx[corner]) < CMP_EPSILON2)
+			return p_uv[corner];
+	}
+
+	const Vector3 edge_b = p_vtx[1] - p_vtx[0];
+	const Vector3 edge_c = p_vtx[2] - p_vtx[0];
+	const Vector3 rel = p_pos - p_vtx[0];
+
+	const float bb = edge_b.dot(edge_b);
+	const float bc = edge_b.dot(edge_c);
+	const float cc = edge_c.dot(edge_c);
+	const float rb = rel.dot(edge_b);
+	const float rc = rel.dot(edge_c);
+
+	const float denom = (bb * cc - bc * bc);
+	if (denom == 0)
+		return p_uv[0]; //no usable barycentric basis
+
+	const float weight_b = (cc * rb - bc * rc) / denom;
+	const float weight_c = (bb * rc - bc * rb) / denom;
+	const float weight_a = 1.0f - weight_b - weight_c;
+
+	return p_uv[0] * weight_a + p_uv[1] * weight_b + p_uv[2] * weight_c;
+}
+
+// Recursively plots one triangle into the voxel octree stored in bake_cells.
+//  p_idx            index of the current cell in bake_cells
+//  p_level          depth of that cell (a cell at cell_subdiv - 1 is a leaf)
+//  p_x, p_y, p_z    integer cell coordinates of the current cell
+//  p_vtx, p_uv      the 3 vertices of the triangle and their UVs
+//  p_material       baked albedo/emission tables sampled by UV
+//  p_aabb           bounds of the current cell
+// At leaf level the triangle's albedo, emission and normal are sampled over a
+// color_scan_cell_width x color_scan_cell_width grid and added to the cell. Above leaf
+// level the 8 children are tested against the triangle, created on demand and recursed into.
+void VoxelLightBaker::_plot_face(int p_idx, int p_level, int p_x, int p_y, int p_z, const Vector3 *p_vtx, const Vector2 *p_uv, const MaterialCache &p_material, const AABB &p_aabb) {
+
+	if (p_level == cell_subdiv - 1) {
+		//plot the face by guessing its albedo and emission value
+
+		//find best axis to map to, for scanning values
+		//(the axis with the largest absolute dot product against the face normal wins)
+		int closest_axis = 0;
+		float closest_dot = 0;
+
+		Plane plane = Plane(p_vtx[0], p_vtx[1], p_vtx[2]);
+		Vector3 normal = plane.normal;
+
+		for (int i = 0; i < 3; i++) {
+
+			Vector3 axis;
+			axis[i] = 1.0;
+			float dot = ABS(normal.dot(axis));
+			if (i == 0 || dot > closest_dot) {
+				closest_axis = i;
+				closest_dot = dot;
+			}
+		}
+
+		//axis is the scan direction, t1 and t2 span the grid perpendicular to it
+		Vector3 axis;
+		axis[closest_axis] = 1.0;
+		Vector3 t1;
+		t1[(closest_axis + 1) % 3] = 1.0;
+		Vector3 t2;
+		t2[(closest_axis + 2) % 3] = 1.0;
+
+		//scale t1/t2 to the size of one grid cell
+		t1 *= p_aabb.size[(closest_axis + 1) % 3] / float(color_scan_cell_width);
+		t2 *= p_aabb.size[(closest_axis + 2) % 3] / float(color_scan_cell_width);
+
+		Color albedo_accum;
+		Color emission_accum;
+		Vector3 normal_accum;
+
+		//counts the grid cells that the face actually covers
+		float alpha = 0.0;
+
+		//map to a grid average in the best axis for this face
+		for (int i = 0; i < color_scan_cell_width; i++) {
+
+			Vector3 ofs_i = float(i) * t1;
+
+			for (int j = 0; j < color_scan_cell_width; j++) {
+
+				Vector3 ofs_j = float(j) * t2;
+
+				Vector3 from = p_aabb.position + ofs_i + ofs_j;
+				Vector3 to = from + t1 + t2 + axis * p_aabb.size[closest_axis];
+				Vector3 half = (to - from) * 0.5;
+
+				//is in this cell?
+				if (!fast_tri_box_overlap(from + half, half, p_vtx)) {
+					continue; //face does not span this cell
+				}
+
+				//go from -size to +size*2 to avoid skipping collisions
+				Vector3 ray_from = from + (t1 + t2) * 0.5 - axis * p_aabb.size[closest_axis];
+				Vector3 ray_to = ray_from + axis * p_aabb.size[closest_axis] * 2;
+
+				if (normal.dot(ray_from - ray_to) < 0) {
+					SWAP(ray_from, ray_to);
+				}
+
+				Vector3 intersection;
+
+				//if the segment misses the plane, fall back to projecting the nearer end point
+				if (!plane.intersects_segment(ray_from, ray_to, &intersection)) {
+					if (ABS(plane.distance_to(ray_from)) < ABS(plane.distance_to(ray_to))) {
+						intersection = plane.project(ray_from);
+					} else {
+
+						intersection = plane.project(ray_to);
+					}
+				}
+
+				//pull the point onto the triangle itself before computing its UV
+				intersection = Face3(p_vtx[0], p_vtx[1], p_vtx[2]).get_closest_point_to(intersection);
+
+				Vector2 uv = get_uv(intersection, p_vtx, p_uv);
+
+				//wrap UVs into [0,1) and convert to texel coordinates of the baked texture
+				int uv_x = CLAMP(Math::fposmod(uv.x, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
+				int uv_y = CLAMP(Math::fposmod(uv.y, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
+
+				int ofs = uv_y * bake_texture_size + uv_x;
+				albedo_accum.r += p_material.albedo[ofs].r;
+				albedo_accum.g += p_material.albedo[ofs].g;
+				albedo_accum.b += p_material.albedo[ofs].b;
+				albedo_accum.a += p_material.albedo[ofs].a;
+
+				emission_accum.r += p_material.emission[ofs].r;
+				emission_accum.g += p_material.emission[ofs].g;
+				emission_accum.b += p_material.emission[ofs].b;
+
+				normal_accum += normal;
+
+				alpha += 1.0;
+			}
+		}
+
+		if (alpha == 0) {
+			//could not in any way get texture information.. so use closest point to center
+
+			Face3 f(p_vtx[0], p_vtx[1], p_vtx[2]);
+			Vector3 inters = f.get_closest_point_to(p_aabb.position + p_aabb.size * 0.5);
+
+			Vector2 uv = get_uv(inters, p_vtx, p_uv);
+
+			int uv_x = CLAMP(Math::fposmod(uv.x, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
+			int uv_y = CLAMP(Math::fposmod(uv.y, 1.0f) * bake_texture_size, 0, bake_texture_size - 1);
+
+			int ofs = uv_y * bake_texture_size + uv_x;
+
+			//weight the single sample as if it covered exactly one grid cell
+			alpha = 1.0 / (color_scan_cell_width * color_scan_cell_width);
+
+			albedo_accum.r = p_material.albedo[ofs].r * alpha;
+			albedo_accum.g = p_material.albedo[ofs].g * alpha;
+			albedo_accum.b = p_material.albedo[ofs].b * alpha;
+			albedo_accum.a = p_material.albedo[ofs].a * alpha;
+
+			emission_accum.r = p_material.emission[ofs].r * alpha;
+			emission_accum.g = p_material.emission[ofs].g * alpha;
+			emission_accum.b = p_material.emission[ofs].b * alpha;
+
+			//NOTE(review): nothing was added to normal_accum on this path, so this only scales its initial value
+			normal_accum *= alpha;
+
+		} else {
+
+			//turn the sums into a coverage-weighted average over the full grid
+			float accdiv = 1.0 / (color_scan_cell_width * color_scan_cell_width);
+			alpha *= accdiv;
+
+			albedo_accum.r *= accdiv;
+			albedo_accum.g *= accdiv;
+			albedo_accum.b *= accdiv;
+			albedo_accum.a *= accdiv;
+
+			emission_accum.r *= accdiv;
+			emission_accum.g *= accdiv;
+			emission_accum.b *= accdiv;
+
+			normal_accum *= accdiv;
+		}
+
+		//put this temporarily here, corrected in a later step
+		//(_fixup_plot divides these sums by the accumulated alpha)
+		bake_cells[p_idx].albedo[0] += albedo_accum.r;
+		bake_cells[p_idx].albedo[1] += albedo_accum.g;
+		bake_cells[p_idx].albedo[2] += albedo_accum.b;
+		bake_cells[p_idx].emission[0] += emission_accum.r;
+		bake_cells[p_idx].emission[1] += emission_accum.g;
+		bake_cells[p_idx].emission[2] += emission_accum.b;
+		bake_cells[p_idx].normal[0] += normal_accum.x;
+		bake_cells[p_idx].normal[1] += normal_accum.y;
+		bake_cells[p_idx].normal[2] += normal_accum.z;
+		bake_cells[p_idx].alpha += alpha;
+
+	} else {
+		//go down
+
+		//size of a child, in leaf-cell units
+		int half = (1 << (cell_subdiv - 1)) >> (p_level + 1);
+		for (int i = 0; i < 8; i++) {
+
+			AABB aabb = p_aabb;
+			aabb.size *= 0.5;
+
+			int nx = p_x;
+			int ny = p_y;
+			int nz = p_z;
+
+			//bits 0/1/2 of i select the upper half on x/y/z
+			if (i & 1) {
+				aabb.position.x += aabb.size.x;
+				nx += half;
+			}
+			if (i & 2) {
+				aabb.position.y += aabb.size.y;
+				ny += half;
+			}
+			if (i & 4) {
+				aabb.position.z += aabb.size.z;
+				nz += half;
+			}
+			//make sure to not plot beyond limits
+			if (nx < 0 || nx >= axis_cell_size[0] || ny < 0 || ny >= axis_cell_size[1] || nz < 0 || nz >= axis_cell_size[2])
+				continue;
+
+			{
+				AABB test_aabb = aabb;
+				//test_aabb.grow_by(test_aabb.get_longest_axis_size()*0.05); //grow a bit to avoid numerical error in real-time
+				Vector3 qsize = test_aabb.size * 0.5; //quarter size, for fast aabb test
+
+				if (!fast_tri_box_overlap(test_aabb.position + qsize, qsize, p_vtx)) {
+					//if (!Face3(p_vtx[0],p_vtx[1],p_vtx[2]).intersects_aabb2(aabb)) {
+					//does not fit in child, go on
+					continue;
+				}
+			}
+
+			if (bake_cells[p_idx].childs[i] == CHILD_EMPTY) {
+				//sub cell must be created
+
+				//the new cell is appended, so its index is the current size; the parent link is
+				//written before the resize and the new cell is only touched after it
+				uint32_t child_idx = bake_cells.size();
+				bake_cells[p_idx].childs[i] = child_idx;
+				bake_cells.resize(bake_cells.size() + 1);
+				bake_cells[child_idx].level = p_level + 1;
+			}
+
+			_plot_face(bake_cells[p_idx].childs[i], p_level + 1, nx, ny, nz, p_vtx, p_uv, p_material, aabb);
+		}
+	}
+}
+
+// Builds a bake_texture_size x bake_texture_size table of colors from p_image.
+// Every RGB texel is computed as (texel / 255) * p_color_mul + p_color_add; alpha is texel / 255.
+// When p_image is null or empty the whole table is filled with p_color_add instead.
+// The image is duplicated first, so the caller's image is left untouched.
+Vector<Color> VoxelLightBaker::_get_bake_texture(Ref<Image> p_image, const Color &p_color_mul, const Color &p_color_add) {
+
+	const int texel_count = bake_texture_size * bake_texture_size;
+
+	Vector<Color> table;
+	table.resize(texel_count);
+
+	const bool no_image = p_image.is_null() || p_image->empty();
+	if (no_image) {
+
+		//nothing to sample, flat fill
+		for (int texel = 0; texel < texel_count; texel++) {
+			table[texel] = p_color_add;
+		}
+		return table;
+	}
+
+	//work on a private copy, the steps below modify the image in place
+	p_image = p_image->duplicate();
+
+	if (p_image->is_compressed()) {
+		print_line("DECOMPRESSING!!!!");
+
+		p_image->decompress();
+	}
+	p_image->convert(Image::FORMAT_RGBA8);
+	p_image->resize(bake_texture_size, bake_texture_size, Image::INTERPOLATE_CUBIC);
+
+	PoolVector<uint8_t>::Read bytes = p_image->get_data().read();
+
+	for (int texel = 0; texel < texel_count; texel++) {
+
+		//4 bytes per texel: R, G, B, A
+		const int base = texel * 4;
+
+		Color c;
+		c.r = (bytes[base + 0] / 255.0) * p_color_mul.r + p_color_add.r;
+		c.g = (bytes[base + 1] / 255.0) * p_color_mul.g + p_color_add.g;
+		c.b = (bytes[base + 2] / 255.0) * p_color_mul.b + p_color_add.b;
+
+		c.a = bytes[base + 3] / 255.0;
+
+		table[texel] = c;
+	}
+
+	return table;
+}
+
+// Returns the baked albedo/emission tables for p_material, building them on first use and
+// memoizing them in material_cache.
+// Only SpatialMaterial is understood; any other (or null) material yields a table of white
+// albedo and black emission.
+VoxelLightBaker::MaterialCache VoxelLightBaker::_get_material_cache(Ref<Material> p_material) {
+
+	//look up with the same key the entry is stored under at the end of this function;
+	//a lookup through the SpatialMaterial cast would turn every other material type into a
+	//null key, so those materials would never find their own entry and get rebuilt each call
+	if (material_cache.has(p_material)) {
+		return material_cache[p_material];
+	}
+
+	//this way of obtaining materials is inaccurate and also does not support some compressed formats very well
+	Ref<SpatialMaterial> mat = p_material; //stays invalid when p_material is not a SpatialMaterial
+
+	MaterialCache mc;
+
+	if (mat.is_valid()) {
+
+		Ref<Texture> albedo_tex = mat->get_texture(SpatialMaterial::TEXTURE_ALBEDO);
+
+		Ref<Image> img_albedo;
+		if (albedo_tex.is_valid()) {
+
+			img_albedo = albedo_tex->get_data();
+			mc.albedo = _get_bake_texture(img_albedo, mat->get_albedo(), Color(0, 0, 0)); // albedo texture, color is multiplicative
+		} else {
+			mc.albedo = _get_bake_texture(img_albedo, Color(1, 1, 1), mat->get_albedo()); // no albedo texture, color is additive
+		}
+
+		Ref<Texture> emission_tex = mat->get_texture(SpatialMaterial::TEXTURE_EMISSION);
+
+		Color emission_col = mat->get_emission();
+		float emission_energy = mat->get_emission_energy();
+
+		Ref<Image> img_emission;
+
+		if (emission_tex.is_valid()) {
+
+			img_emission = emission_tex->get_data();
+		}
+
+		//the emission operator decides whether the emission color is added to or multiplied with the texture
+		if (mat->get_emission_operator() == SpatialMaterial::EMISSION_OP_ADD) {
+			mc.emission = _get_bake_texture(img_emission, Color(1, 1, 1) * emission_energy, emission_col * emission_energy);
+		} else {
+			mc.emission = _get_bake_texture(img_emission, emission_col * emission_energy, Color(0, 0, 0));
+		}
+
+	} else {
+		//not a SpatialMaterial: white albedo, no emission
+		Ref<Image> empty;
+
+		mc.albedo = _get_bake_texture(empty, Color(0, 0, 0), Color(1, 1, 1));
+		mc.emission = _get_bake_texture(empty, Color(0, 0, 0), Color(0, 0, 0));
+	}
+
+	material_cache[p_material] = mc;
+	return mc;
+}
+
+// Plots every triangle surface of p_mesh, transformed by p_xform, into the voxel octree.
+// Material per surface, in order of preference: p_override_material, p_materials[surface],
+// the mesh's own surface material.
+// Faces that do not overlap original_bounds are skipped. Afterwards max_original_cells is
+// set to the number of cells in bake_cells.
+void VoxelLightBaker::plot_mesh(const Transform &p_xform, Ref<Mesh> &p_mesh, const Vector<Ref<Material> > &p_materials, const Ref<Material> &p_override_material) {
+
+	//box used to reject faces outside of the area being baked
+	const Vector3 bounds_half = original_bounds.size * 0.5;
+	const Vector3 bounds_center = original_bounds.position + bounds_half;
+
+	for (int surface = 0; surface < p_mesh->get_surface_count(); surface++) {
+
+		if (p_mesh->surface_get_primitive_type(surface) != Mesh::PRIMITIVE_TRIANGLES)
+			continue; //only triangles
+
+		Ref<Material> src_material;
+
+		if (p_override_material.is_valid()) {
+			src_material = p_override_material;
+		} else if (surface < p_materials.size() && p_materials[surface].is_valid()) {
+			src_material = p_materials[surface];
+		} else {
+			src_material = p_mesh->surface_get_material(surface);
+		}
+		MaterialCache material = _get_material_cache(src_material);
+
+		Array arrays = p_mesh->surface_get_arrays(surface);
+
+		PoolVector<Vector3> vertices = arrays[Mesh::ARRAY_VERTEX];
+		PoolVector<Vector3>::Read vertex_read = vertices.read();
+		PoolVector<Vector2> uv = arrays[Mesh::ARRAY_TEX_UV];
+		PoolVector<Vector2>::Read uv_read;
+		PoolVector<int> index = arrays[Mesh::ARRAY_INDEX];
+		PoolVector<int>::Read index_read;
+
+		const bool has_uv = uv.size() != 0;
+		if (has_uv) {
+			uv_read = uv.read();
+		}
+
+		//indexed surfaces take their corners through the index array,
+		//non-indexed ones use three consecutive vertices per face
+		const bool indexed = index.size() != 0;
+		if (indexed) {
+			index_read = index.read();
+		}
+
+		const int facecount = (indexed ? index.size() : vertices.size()) / 3;
+
+		for (int face = 0; face < facecount; face++) {
+
+			Vector3 vtxs[3];
+			Vector2 uvs[3];
+
+			for (int corner = 0; corner < 3; corner++) {
+
+				const int slot = face * 3 + corner;
+				const int vertex = indexed ? index_read[slot] : slot;
+
+				vtxs[corner] = p_xform.xform(vertex_read[vertex]);
+				if (has_uv) {
+					uvs[corner] = uv_read[vertex];
+				}
+			}
+
+			//test against original bounds
+			if (!fast_tri_box_overlap(bounds_center, bounds_half, vtxs))
+				continue;
+
+			//plot face, starting at the root cell
+			_plot_face(0, 0, 0, 0, 0, vtxs, uvs, material, po2_bounds);
+		}
+	}
+
+	max_original_cells = bake_cells.size();
+}
+
+// Walks the octree from cell p_idx and stores the integer coordinates of every visited cell
+// in the matching bake_light entry. Leaf cells (level cell_subdiv - 1) are additionally
+// pushed to the front of the linked list that starts at first_leaf.
+// p_parent is accepted but not used by this function.
+void VoxelLightBaker::_init_light_plot(int p_idx, int p_level, int p_x, int p_y, int p_z, uint32_t p_parent) {
+
+	bake_light[p_idx].x = p_x;
+	bake_light[p_idx].y = p_y;
+	bake_light[p_idx].z = p_z;
+
+	const bool is_leaf = (p_level == cell_subdiv - 1);
+	if (is_leaf) {
+
+		//prepend to the leaf list
+		bake_light[p_idx].next_leaf = first_leaf;
+		first_leaf = p_idx;
+		return;
+	}
+
+	//go down; step is the size of a child in leaf-cell units
+	const int step = (1 << (cell_subdiv - 1)) >> (p_level + 1);
+
+	for (int octant = 0; octant < 8; octant++) {
+
+		const uint32_t child = bake_cells[p_idx].childs[octant];
+		if (child == CHILD_EMPTY)
+			continue;
+
+		//bits 0/1/2 of the octant select the upper half on x/y/z
+		const int child_x = (octant & 1) ? p_x + step : p_x;
+		const int child_y = (octant & 2) ? p_y + step : p_y;
+		const int child_z = (octant & 4) ? p_z + step : p_z;
+
+		_init_light_plot(child, p_level + 1, child_x, child_y, child_z, p_idx);
+	}
+}
+
+// Prepares the light data (see _check_init_light) and stores the settings used by the
+// light bake: quality, mode, propagation and energy.
+void VoxelLightBaker::begin_bake_light(BakeQuality p_quality, BakeMode p_bake_mode, float p_propagation, float p_energy) {
+
+	//must run before the settings are stored, same as the light plotting entry points do
+	_check_init_light();
+
+	bake_quality = p_quality;
+	bake_mode = p_bake_mode;
+	propagation = p_propagation;
+	energy = p_energy;
+}
+
+// Lazily sets up the per-cell light data. Does nothing if bake_light already has entries.
+// Otherwise it runs the pre-fixup pass over the plotted cells, allocates one zeroed Light per
+// cell and builds the leaf list through _init_light_plot.
+void VoxelLightBaker::_check_init_light() {
+
+	if (bake_light.size() != 0)
+		return; //already initialized
+
+	direct_lights_baked = false;
+	leaf_voxel_count = 0;
+
+	//pre fixup, so normal, albedo, emission, etc. work for lighting.
+	_fixup_plot(0, 0);
+
+	bake_light.resize(bake_cells.size());
+	zeromem(bake_light.ptrw(), bake_light.size() * sizeof(Light));
+
+	//-1 marks the end of the leaf list
+	first_leaf = -1;
+	_init_light_plot(0, 0, 0, 0, 0, CHILD_EMPTY);
+}
+
+// Returns 1 / (p_normal dot d), where d is the signed unit axis of the component of p_normal
+// with the largest magnitude. Ties are resolved in favor of x, then y; a component that is
+// not greater than zero selects the negative axis.
+// If none of the three comparisons below holds, +X is used.
+static float _get_normal_advance(const Vector3 &p_normal) {
+
+	const Vector3 magnitude = p_normal.abs();
+
+	//fallback, only kept when no branch below matches
+	Vector3 dominant(1.0, 0.0, 0.0);
+
+	if ((magnitude.x >= magnitude.y) && (magnitude.x >= magnitude.z)) {
+		// x dominates
+		dominant = Vector3(p_normal.x > 0.0 ? 1.0 : -1.0, 0.0, 0.0);
+	} else if ((magnitude.y > magnitude.x) && (magnitude.y >= magnitude.z)) {
+		// y dominates
+		dominant = Vector3(0.0, p_normal.y > 0.0 ? 1.0 : -1.0, 0.0);
+	} else if ((magnitude.z > magnitude.x) && (magnitude.z > magnitude.y)) {
+		// z dominates
+		dominant = Vector3(0.0, 0.0, p_normal.z > 0.0 ? 1.0 : -1.0);
+	}
+
+	return 1.0 / p_normal.dot(dominant);
+}
+
+// Unit vectors of the six axis directions, in the order -X, +X, -Y, +Y, -Z, +Z.
+// The light plotting functions index this table with the same index they use for the six
+// accum / direct_accum slots of a Light.
+static const Vector3 aniso_normal[6] = {
+	Vector3(-1, 0, 0),
+	Vector3(1, 0, 0),
+	Vector3(0, -1, 0),
+	Vector3(0, 1, 0),
+	Vector3(0, 0, -1),
+	Vector3(0, 0, 1)
+};
+
+// Descends the octree in cells from the root (index 0) to the leaf that contains the integer
+// cell coordinate (x, y, z) and returns its index.
+// Returns -1 (converted to uint32_t) when the coordinate lies outside [0, 1 << (cell_subdiv - 1))
+// on any axis, and CHILD_EMPTY when the path to the leaf hits a missing child.
+uint32_t VoxelLightBaker::_find_cell_at_pos(const Cell *cells, int x, int y, int z) {
+
+	const int size = 1 << (cell_subdiv - 1);
+
+	if (x < 0 || x >= size || y < 0 || y >= size || z < 0 || z >= size)
+		return -1;
+
+	uint32_t cell = 0;
+
+	//lower corner of the cell currently being looked at
+	int min_x = 0;
+	int min_y = 0;
+	int min_z = 0;
+
+	//half is the size of a child at the current depth, halved after every step down
+	int half = size / 2;
+
+	for (int depth = 0; depth < cell_subdiv - 1; depth++, half >>= 1) {
+
+		//pick the octant: bits 0/1/2 are set for the upper half on x/y/z
+		int octant = 0;
+		if (x >= min_x + half) {
+			octant |= 1;
+			min_x += half;
+		}
+		if (y >= min_y + half) {
+			octant |= 2;
+			min_y += half;
+		}
+		if (z >= min_z + half) {
+			octant |= 4;
+			min_z += half;
+		}
+
+		cell = cells[cell].childs[octant];
+		if (cell == CHILD_EMPTY)
+			return CHILD_EMPTY;
+	}
+
+	return cell;
+}
+// Plots a directional light into every leaf voxel that the light reaches.
+//  p_direction        direction the light travels in, in cell space
+//  p_color            light color
+//  p_energy           light energy
+//  p_indirect_energy  extra multiplier applied on top of p_energy
+//  p_direct           when true, direct_lights_baked is flagged
+// For each leaf a ray is marched along p_direction towards the leaf; light is accumulated
+// only if the first occupied cell found on the way is the leaf itself.
+void VoxelLightBaker::plot_light_directional(const Vector3 &p_direction, const Color &p_color, float p_energy, float p_indirect_energy, bool p_direct) {
+
+	_check_init_light();
+
+	//long enough to start any ray outside of the grid
+	float max_len = Vector3(axis_cell_size[0], axis_cell_size[1], axis_cell_size[2]).length() * 1.1;
+
+	if (p_direct)
+		direct_lights_baked = true;
+
+	Vector3 light_axis = p_direction;
+	Plane clip[3];
+	int clip_planes = 0;
+
+	Light *light_data = bake_light.ptrw();
+	const Cell *cells = bake_cells.ptr();
+
+	//one axis-aligned clip plane per axis the light moves along, placed just outside the grid
+	//on the side the light comes from; the direction is the same for all leaves, so these
+	//are built once
+	for (int i = 0; i < 3; i++) {
+
+		if (ABS(light_axis[i]) < CMP_EPSILON)
+			continue;
+		clip[clip_planes].normal[i] = 1.0;
+
+		if (light_axis[i] < 0) {
+
+			clip[clip_planes].d = axis_cell_size[i] + 1;
+		} else {
+			clip[clip_planes].d -= 1.0;
+		}
+
+		clip_planes++;
+	}
+
+	//ray length needed to advance exactly one cell along the dominant axis
+	float distance_adv = _get_normal_advance(light_axis);
+
+	Vector3 light_energy = Vector3(p_color.r, p_color.g, p_color.b) * p_energy * p_indirect_energy;
+
+	int idx = first_leaf;
+	while (idx >= 0) {
+
+		//print_line("plot idx " + itos(idx));
+		Light *light = &light_data[idx];
+
+		Vector3 to(light->x + 0.5, light->y + 0.5, light->z + 0.5);
+		to += -light_axis.sign() * 0.47; //make it more likely to receive a ray
+
+		Vector3 from = to - max_len * light_axis;
+
+		//pull the start of the ray in to the clip planes
+		for (int j = 0; j < clip_planes; j++) {
+
+			clip[j].intersects_segment(from, to, &from);
+		}
+
+		float distance = (to - from).length();
+		distance += distance_adv - Math::fmod(distance, distance_adv); //make it reach the center of the box always
+		from = to - light_axis * distance;
+
+		uint32_t result = 0xFFFFFFFF;
+
+		//march until the first occupied cell is found
+		while (distance > -distance_adv) { //use this to avoid precision errors
+
+			result = _find_cell_at_pos(cells, int(floor(from.x)), int(floor(from.y)), int(floor(from.z)));
+			if (result != 0xFFFFFFFF) {
+				break;
+			}
+
+			from += light_axis * distance_adv;
+			distance -= distance_adv;
+		}
+
+		if (result == idx) {
+			//cell hit itself! hooray!
+
+			Vector3 normal(cells[idx].normal[0], cells[idx].normal[1], cells[idx].normal[2]);
+			if (normal == Vector3()) {
+				//no usable normal, light all six directions equally
+				for (int i = 0; i < 6; i++) {
+					light->accum[i][0] += light_energy.x * cells[idx].albedo[0];
+					light->accum[i][1] += light_energy.y * cells[idx].albedo[1];
+					light->accum[i][2] += light_energy.z * cells[idx].albedo[2];
+				}
+
+			} else {
+
+				//weight each of the six directions by how much it faces against the cell normal
+				for (int i = 0; i < 6; i++) {
+					float s = MAX(0.0, aniso_normal[i].dot(-normal));
+					light->accum[i][0] += light_energy.x * cells[idx].albedo[0] * s;
+					light->accum[i][1] += light_energy.y * cells[idx].albedo[1] * s;
+					light->accum[i][2] += light_energy.z * cells[idx].albedo[2] * s;
+				}
+			}
+
+			for (int i = 0; i < 6; i++) {
+				float s = MAX(0.0, aniso_normal[i].dot(-light_axis)); //light depending on normal for direct
+				light->direct_accum[i][0] += light_energy.x * s;
+				light->direct_accum[i][1] += light_energy.y * s;
+				light->direct_accum[i][2] += light_energy.z * s;
+			}
+		}
+
+		idx = light_data[idx].next_leaf;
+	}
+}
+
+// Plots an omni (point) light into every leaf voxel that the light reaches.
+//  p_pos              light position, transformed by to_cell_space before use
+//  p_color            light color
+//  p_energy           light energy
+//  p_indirect_energy  extra multiplier applied on top of p_energy
+//  p_radius           light radius, scaled into cell space
+//  p_attenutation     exponent of the distance falloff
+//  p_direct           when true, direct_lights_baked is flagged
+// For each leaf a ray is marched from the light towards the leaf; light is accumulated only
+// if the first occupied cell found on the way is the leaf itself.
+void VoxelLightBaker::plot_light_omni(const Vector3 &p_pos, const Color &p_color, float p_energy, float p_indirect_energy, float p_radius, float p_attenutation, bool p_direct) {
+
+	_check_init_light();
+
+	if (p_direct)
+		direct_lights_baked = true;
+
+	// uint64_t us = OS::get_singleton()->get_ticks_usec();
+
+	Vector3 light_pos = to_cell_space.xform(p_pos) + Vector3(0.5, 0.5, 0.5);
+	//Vector3 spot_axis = -light_cache.transform.basis.get_axis(2).normalized();
+
+	float local_radius = to_cell_space.basis.xform(Vector3(0, 0, 1)).length() * p_radius;
+
+	Light *light_data = bake_light.ptrw();
+	const Cell *cells = bake_cells.ptr();
+	Vector3 light_energy = Vector3(p_color.r, p_color.g, p_color.b) * p_energy * p_indirect_energy;
+
+	int idx = first_leaf;
+	while (idx >= 0) {
+
+		//print_line("plot idx " + itos(idx));
+		Light *light = &light_data[idx];
+
+		Vector3 to(light->x + 0.5, light->y + 0.5, light->z + 0.5);
+		to += (light_pos - to).sign() * 0.47; //make it more likely to receive a ray
+
+		Vector3 light_axis = (to - light_pos).normalized();
+		//ray length needed to advance exactly one cell along the dominant axis
+		float distance_adv = _get_normal_advance(light_axis);
+
+		Vector3 normal(cells[idx].normal[0], cells[idx].normal[1], cells[idx].normal[2]);
+
+		//skip cells whose normal faces away from the light
+		if (normal != Vector3() && normal.dot(-light_axis) < 0.001) {
+			idx = light_data[idx].next_leaf;
+			continue;
+		}
+
+		float att = 1.0;
+		{
+			float d = light_pos.distance_to(to);
+			if (d + distance_adv > local_radius) {
+				idx = light_data[idx].next_leaf;
+				continue; // too far away
+			}
+
+			float dt = CLAMP((d + distance_adv) / local_radius, 0, 1);
+			att *= powf(1.0 - dt, p_attenutation);
+		}
+#if 0
+		if (light_cache.type == VS::LIGHT_SPOT) {
+
+			float angle = Math::rad2deg(acos(light_axis.dot(spot_axis)));
+			if (angle > light_cache.spot_angle)
+				continue;
+
+			float d = CLAMP(angle / light_cache.spot_angle, 1, 0);
+			att *= powf(1.0 - d, light_cache.spot_attenuation);
+		}
+#endif
+		//the ray direction differs per leaf, so the clip planes are rebuilt from
+		//default-constructed planes every time; reusing the planes of the previous leaf would
+		//keep stale normal components and let the offset drift from leaf to leaf
+		Plane clip[3];
+		int clip_planes = 0;
+
+		for (int c = 0; c < 3; c++) {
+
+			if (ABS(light_axis[c]) < CMP_EPSILON)
+				continue;
+			clip[clip_planes].normal[c] = 1.0;
+
+			if (light_axis[c] < 0) {
+
+				clip[clip_planes].d = (1 << (cell_subdiv - 1)) + 1;
+			} else {
+				clip[clip_planes].d = -1.0;
+			}
+
+			clip_planes++;
+		}
+
+		Vector3 from = light_pos;
+
+		//pull the start of the ray in to the clip planes
+		for (int j = 0; j < clip_planes; j++) {
+
+			clip[j].intersects_segment(from, to, &from);
+		}
+
+		float distance = (to - from).length();
+
+		distance -= Math::fmod(distance, distance_adv); //make it reach the center of the box always, but this time make it closer
+		from = to - light_axis * distance;
+		//NOTE(review): `to` is not read again below, so this second nudge does not affect the march
+		to += (light_pos - to).sign() * 0.47; //make it more likely to receive a ray
+
+		uint32_t result = 0xFFFFFFFF;
+
+		//march until the first occupied cell is found
+		while (distance > -distance_adv) { //use this to avoid precision errors
+
+			result = _find_cell_at_pos(cells, int(floor(from.x)), int(floor(from.y)), int(floor(from.z)));
+			if (result != 0xFFFFFFFF) {
+				break;
+			}
+
+			from += light_axis * distance_adv;
+			distance -= distance_adv;
+		}
+
+		if (result == idx) {
+			//cell hit itself! hooray!
+
+			if (normal == Vector3()) {
+				//no usable normal, light all six directions equally
+				for (int i = 0; i < 6; i++) {
+					light->accum[i][0] += light_energy.x * cells[idx].albedo[0] * att;
+					light->accum[i][1] += light_energy.y * cells[idx].albedo[1] * att;
+					light->accum[i][2] += light_energy.z * cells[idx].albedo[2] * att;
+				}
+
+			} else {
+
+				//weight each of the six directions by how much it faces against the cell normal
+				for (int i = 0; i < 6; i++) {
+					float s = MAX(0.0, aniso_normal[i].dot(-normal));
+					light->accum[i][0] += light_energy.x * cells[idx].albedo[0] * s * att;
+					light->accum[i][1] += light_energy.y * cells[idx].albedo[1] * s * att;
+					light->accum[i][2] += light_energy.z * cells[idx].albedo[2] * s * att;
+				}
+			}
+
+			for (int i = 0; i < 6; i++) {
+				float s = MAX(0.0, aniso_normal[i].dot(-light_axis)); //light depending on normal for direct
+				light->direct_accum[i][0] += light_energy.x * s * att;
+				light->direct_accum[i][1] += light_energy.y * s * att;
+				light->direct_accum[i][2] += light_energy.z * s * att;
+			}
+		}
+
+		idx = light_data[idx].next_leaf;
+	}
+}
+
+// Plots a spot light into every leaf voxel that the light reaches.
+//  p_pos               light position, transformed by to_cell_space before use
+//  p_axis              spot axis, transformed by the basis of to_cell_space and normalized
+//  p_color             light color
+//  p_energy            light energy
+//  p_indirect_energy   extra multiplier applied on top of p_energy
+//  p_radius            light radius, scaled into cell space
+//  p_attenutation      exponent of the distance falloff
+//  p_spot_angle        cone half angle in degrees (compared against a rad2deg value)
+//  p_spot_attenuation  exponent of the angular falloff
+//  p_direct            when true, direct_lights_baked is flagged
+// For each leaf inside the cone a ray is marched from the light towards the leaf; light is
+// accumulated only if the first occupied cell found on the way is the leaf itself.
+void VoxelLightBaker::plot_light_spot(const Vector3 &p_pos, const Vector3 &p_axis, const Color &p_color, float p_energy, float p_indirect_energy, float p_radius, float p_attenutation, float p_spot_angle, float p_spot_attenuation, bool p_direct) {
+
+	_check_init_light();
+
+	if (p_direct)
+		direct_lights_baked = true;
+
+	// uint64_t us = OS::get_singleton()->get_ticks_usec();
+
+	Vector3 light_pos = to_cell_space.xform(p_pos) + Vector3(0.5, 0.5, 0.5);
+	Vector3 spot_axis = to_cell_space.basis.xform(p_axis).normalized();
+
+	float local_radius = to_cell_space.basis.xform(Vector3(0, 0, 1)).length() * p_radius;
+
+	Light *light_data = bake_light.ptrw();
+	const Cell *cells = bake_cells.ptr();
+	Vector3 light_energy = Vector3(p_color.r, p_color.g, p_color.b) * p_energy * p_indirect_energy;
+
+	int idx = first_leaf;
+	while (idx >= 0) {
+
+		//print_line("plot idx " + itos(idx));
+		Light *light = &light_data[idx];
+
+		Vector3 to(light->x + 0.5, light->y + 0.5, light->z + 0.5);
+
+		Vector3 light_axis = (to - light_pos).normalized();
+		//ray length needed to advance exactly one cell along the dominant axis
+		float distance_adv = _get_normal_advance(light_axis);
+
+		Vector3 normal(cells[idx].normal[0], cells[idx].normal[1], cells[idx].normal[2]);
+
+		//skip cells whose normal faces away from the light
+		if (normal != Vector3() && normal.dot(-light_axis) < 0.001) {
+			idx = light_data[idx].next_leaf;
+			continue;
+		}
+
+		//rounding can push the dot product of two unit vectors slightly outside [-1, 1], where
+		//acos yields NaN; a NaN angle would pass the cone test below and poison the accumulators
+		float spot_cos = light_axis.dot(-spot_axis);
+		spot_cos = CLAMP(spot_cos, -1.0f, 1.0f);
+
+		float angle = Math::rad2deg(Math::acos(spot_cos));
+		if (angle > p_spot_angle) {
+			idx = light_data[idx].next_leaf;
+			continue; // outside of the cone
+		}
+
+		float att = Math::pow(1.0f - angle / p_spot_angle, p_spot_attenuation);
+
+		{
+			float d = light_pos.distance_to(to);
+			if (d + distance_adv > local_radius) {
+				idx = light_data[idx].next_leaf;
+				continue; // too far away
+			}
+
+			float dt = CLAMP((d + distance_adv) / local_radius, 0, 1);
+			att *= powf(1.0 - dt, p_attenutation);
+		}
+#if 0
+		if (light_cache.type == VS::LIGHT_SPOT) {
+
+			float angle = Math::rad2deg(acos(light_axis.dot(spot_axis)));
+			if (angle > light_cache.spot_angle)
+				continue;
+
+			float d = CLAMP(angle / light_cache.spot_angle, 1, 0);
+			att *= powf(1.0 - d, light_cache.spot_attenuation);
+		}
+#endif
+		//the ray direction differs per leaf, so the clip planes are rebuilt from
+		//default-constructed planes every time; reusing the planes of the previous leaf would
+		//keep stale normal components and let the offset drift from leaf to leaf
+		Plane clip[3];
+		int clip_planes = 0;
+
+		for (int c = 0; c < 3; c++) {
+
+			if (ABS(light_axis[c]) < CMP_EPSILON)
+				continue;
+			clip[clip_planes].normal[c] = 1.0;
+
+			if (light_axis[c] < 0) {
+
+				clip[clip_planes].d = (1 << (cell_subdiv - 1)) + 1;
+			} else {
+				clip[clip_planes].d = -1.0;
+			}
+
+			clip_planes++;
+		}
+
+		Vector3 from = light_pos;
+
+		//pull the start of the ray in to the clip planes
+		for (int j = 0; j < clip_planes; j++) {
+
+			clip[j].intersects_segment(from, to, &from);
+		}
+
+		float distance = (to - from).length();
+
+		distance -= Math::fmod(distance, distance_adv); //make it reach the center of the box always, but this time make it closer
+		from = to - light_axis * distance;
+
+		uint32_t result = 0xFFFFFFFF;
+
+		//march until the first occupied cell is found
+		while (distance > -distance_adv) { //use this to avoid precision errors
+
+			result = _find_cell_at_pos(cells, int(floor(from.x)), int(floor(from.y)), int(floor(from.z)));
+			if (result != 0xFFFFFFFF) {
+				break;
+			}
+
+			from += light_axis * distance_adv;
+			distance -= distance_adv;
+		}
+
+		if (result == idx) {
+			//cell hit itself! hooray!
+
+			if (normal == Vector3()) {
+				//no usable normal, light all six directions equally
+				for (int i = 0; i < 6; i++) {
+					light->accum[i][0] += light_energy.x * cells[idx].albedo[0] * att;
+					light->accum[i][1] += light_energy.y * cells[idx].albedo[1] * att;
+					light->accum[i][2] += light_energy.z * cells[idx].albedo[2] * att;
+				}
+
+			} else {
+
+				//weight each of the six directions by how much it faces against the cell normal
+				for (int i = 0; i < 6; i++) {
+					float s = MAX(0.0, aniso_normal[i].dot(-normal));
+					light->accum[i][0] += light_energy.x * cells[idx].albedo[0] * s * att;
+					light->accum[i][1] += light_energy.y * cells[idx].albedo[1] * s * att;
+					light->accum[i][2] += light_energy.z * cells[idx].albedo[2] * s * att;
+				}
+			}
+
+			for (int i = 0; i < 6; i++) {
+				float s = MAX(0.0, aniso_normal[i].dot(-light_axis)); //light depending on normal for direct
+				light->direct_accum[i][0] += light_energy.x * s * att;
+				light->direct_accum[i][1] += light_energy.y * s * att;
+				light->direct_accum[i][2] += light_energy.z * s * att;
+			}
+		}
+
+		idx = light_data[idx].next_leaf;
+	}
+}
+
+// Post-processes the octree below cell p_idx (which sits at depth p_level).
+// Leaves: albedo, emission and normal sums are divided by the accumulated alpha, the normal
+// is normalized (or zeroed when shorter than 0.01), alpha is set to 1 and leaf_voxel_count
+// is incremented.
+// Inner cells: albedo, emission and normal are cleared and alpha becomes the sum of the child
+// alphas divided by 8. If light data exists, light and emission are summed from the children
+// and divided by a value interpolated between 8 and the number of existing children by
+// propagation.
+void VoxelLightBaker::_fixup_plot(int p_idx, int p_level) {
+
+	//not resized anywhere in this pass, so it can be checked once
+	const bool has_light = bake_light.size() > 0;
+
+	if (p_level == cell_subdiv - 1) {
+
+		leaf_voxel_count++;
+		float alpha = bake_cells[p_idx].alpha;
+
+		//turn the accumulated sums into averages
+		for (int c = 0; c < 3; c++) {
+			bake_cells[p_idx].albedo[c] /= alpha;
+			bake_cells[p_idx].emission[c] /= alpha; //transfer emission to light
+			bake_cells[p_idx].normal[c] /= alpha;
+		}
+
+		Vector3 n(bake_cells[p_idx].normal[0], bake_cells[p_idx].normal[1], bake_cells[p_idx].normal[2]);
+
+		//too much fight over normal, zero it
+		const bool conflicting = n.length() < 0.01;
+		if (!conflicting) {
+			n.normalize();
+		}
+		for (int c = 0; c < 3; c++) {
+			bake_cells[p_idx].normal[c] = conflicting ? 0 : n[c];
+		}
+
+		bake_cells[p_idx].alpha = 1.0;
+		return;
+	}
+
+	//go down
+
+	for (int c = 0; c < 3; c++) {
+		bake_cells[p_idx].emission[c] = 0;
+		bake_cells[p_idx].normal[c] = 0;
+		bake_cells[p_idx].albedo[c] = 0;
+	}
+	if (has_light) {
+		for (int j = 0; j < 6; j++) {
+			for (int c = 0; c < 3; c++) {
+				bake_light[p_idx].accum[j][c] = 0;
+			}
+		}
+	}
+
+	float alpha_sum = 0;
+	int children_found = 0;
+
+	for (int octant = 0; octant < 8; octant++) {
+
+		uint32_t child = bake_cells[p_idx].childs[octant];
+
+		if (child == CHILD_EMPTY)
+			continue;
+
+		//children first, their values are gathered right below
+		_fixup_plot(child, p_level + 1);
+		alpha_sum += bake_cells[child].alpha;
+
+		if (has_light) {
+			for (int j = 0; j < 6; j++) {
+				for (int c = 0; c < 3; c++) {
+					bake_light[p_idx].accum[j][c] += bake_light[child].accum[j][c];
+				}
+			}
+			for (int c = 0; c < 3; c++) {
+				bake_cells[p_idx].emission[c] += bake_cells[child].emission[c];
+			}
+		}
+
+		children_found++;
+	}
+
+	//missing children count as fully transparent
+	bake_cells[p_idx].alpha = alpha_sum / 8.0;
+
+	if (has_light && children_found) {
+		float divisor = Math::lerp(8, children_found, propagation);
+		for (int j = 0; j < 6; j++) {
+			for (int c = 0; c < 3; c++) {
+				bake_light[p_idx].accum[j][c] /= divisor;
+			}
+		}
+		for (int c = 0; c < 3; c++) {
+			bake_cells[p_idx].emission[c] /= divisor;
+		}
+	}
+}
+
+//make sure any cell (save for the root) has an empty cell previous to it, so it can be interpolated into
+
+// Barycentric interpolation helper for _plot_triangle(): given the three triangle
+// corners in lightmap pixel space (x[], y[]), computes the position and normal at
+// pixel (xi, yi). Degenerate (zero area) triangles yield the first corner's values.
+static void vlb_interpolate_triangle_pixel(const int *x, const int *y, int xi, int yi, const Vector3 *positions, const Vector3 *normals, Vector3 &r_pos, Vector3 &r_normal) {
+
+	Vector2 v0 = Vector2(x[1] - x[0], y[1] - y[0]);
+	Vector2 v1 = Vector2(x[2] - x[0], y[2] - y[0]);
+	Vector2 v2 = Vector2(xi - x[0], yi - y[0]);
+	float d00 = v0.dot(v0);
+	float d01 = v0.dot(v1);
+	float d11 = v1.dot(v1);
+	float d20 = v2.dot(v0);
+	float d21 = v2.dot(v1);
+	float denom = (d00 * d11 - d01 * d01);
+	// zero determinant: the corners are collinear in pixel space
+	if (denom == 0) {
+		r_pos = positions[0];
+		r_normal = normals[0];
+		return;
+	}
+
+	// barycentric weights: u for corner 0, v for corner 1, w for corner 2
+	float v = (d11 * d20 - d01 * d21) / denom;
+	float w = (d00 * d21 - d01 * d20) / denom;
+	float u = 1.0f - v - w;
+	r_pos = positions[0] * u + positions[1] * v + positions[2] * w;
+	r_normal = normals[0] * u + normals[1] * v + normals[2] * w;
+}
+
+// Rasterizes one triangle into the lightmap pixel buffer. vertices[] are scaled by
+// width/height to obtain pixel coordinates; every pixel visited by the scanline loops
+// gets the interpolated position and normal written to pixels[yi * width + xi].
+// Note that positions[] and normals[] are reordered in place while sorting by y.
+void VoxelLightBaker::_plot_triangle(Vector2 *vertices, Vector3 *positions, Vector3 *normals, LightMap *pixels, int width, int height) {
+
+	// triangle corners in integer pixel coordinates
+	int x[3];
+	int y[3];
+
+	for (int j = 0; j < 3; j++) {
+
+		x[j] = vertices[j].x * width;
+		y[j] = vertices[j].y * height;
+	}
+
+	// sort the points vertically
+	if (y[1] > y[2]) {
+		SWAP(x[1], x[2]);
+		SWAP(y[1], y[2]);
+		SWAP(positions[1], positions[2]);
+		SWAP(normals[1], normals[2]);
+	}
+	if (y[0] > y[1]) {
+		SWAP(x[0], x[1]);
+		SWAP(y[0], y[1]);
+		SWAP(positions[0], positions[1]);
+		SWAP(normals[0], normals[1]);
+	}
+	if (y[1] > y[2]) {
+		SWAP(x[1], x[2]);
+		SWAP(y[1], y[2]);
+		SWAP(positions[1], positions[2]);
+		SWAP(normals[1], normals[2]);
+	}
+
+	// per-scanline x increments of the three edges (the +1 keeps the divisor non-zero on flat edges)
+	double dx_far = double(x[2] - x[0]) / (y[2] - y[0] + 1);
+	double dx_upper = double(x[1] - x[0]) / (y[1] - y[0] + 1);
+	double dx_low = double(x[2] - x[1]) / (y[2] - y[1] + 1);
+	double xf = x[0];
+	double xt = x[0] + dx_upper; // if y[0] == y[1], special case
+	// walk the scanlines from the top corner to the bottom one, clipped to the lightmap height;
+	// xf follows the long (far) edge, xt follows the upper edge and then the lower edge
+	for (int yi = y[0]; yi <= (y[2] > height - 1 ? height - 1 : y[2]); yi++) {
+		// rows with yi < 0 lie outside the lightmap, only the edge positions are advanced for them
+		if (yi >= 0) {
+			// span walked left to right (covers the case where xf is left of xt)
+			for (int xi = (xf > 0 ? int(xf) : 0); xi <= (xt < width ? xt : width - 1); xi++) {
+
+				Vector3 pos;
+				Vector3 normal;
+				vlb_interpolate_triangle_pixel(x, y, xi, yi, positions, normals, pos, normal);
+
+				int ofs = yi * width + xi;
+				pixels[ofs].normal = normal;
+				pixels[ofs].pos = pos;
+			}
+
+			// span walked right to left (covers the case where xf is right of xt)
+			for (int xi = (xf < width ? int(xf) : width - 1); xi >= (xt > 0 ? xt : 0); xi--) {
+
+				Vector3 pos;
+				Vector3 normal;
+				vlb_interpolate_triangle_pixel(x, y, xi, yi, positions, normals, pos, normal);
+
+				int ofs = yi * width + xi;
+				pixels[ofs].normal = normal;
+				pixels[ofs].pos = pos;
+			}
+		}
+		// advance both edge intersections to the next scanline
+		xf += dx_far;
+		if (yi < y[1])
+			xt += dx_upper;
+		else
+			xt += dx_low;
+	}
+}
+
+void VoxelLightBaker::_sample_baked_octree_filtered_and_anisotropic(const Vector3 &p_posf, const Vector3 &p_direction, float p_level, Vector3 &r_color, float &r_alpha) {
+	// Samples light and alpha of the baked octree at p_posf: reads 8 neighbouring cells at two adjacent octree levels, weights the anisotropic light by p_direction, interpolates trilinearly and then blends the two levels. Results go to r_color and r_alpha.
+	int size = 1 << (cell_subdiv - 1);
+
+	int clamp_v = size - 1;
+	//first of all, clamp the sample position to the grid
+	Vector3 pos;
+	pos.x = CLAMP(p_posf.x, 0, clamp_v);
+	pos.y = CLAMP(p_posf.y, 0, clamp_v);
+	pos.z = CLAMP(p_posf.z, 0, clamp_v);
+	// p_level counts upwards from the deepest level; convert it to an octree depth where 0 is the root
+	float level = (cell_subdiv - 1) - p_level;
+
+	int target_level; // deepest level that gets sampled
+	float level_filter; // blend factor towards the next coarser level
+	if (level <= 0.0) {
+		level_filter = 0;
+		target_level = 0;
+	} else {
+		target_level = Math::ceil(level);
+		level_filter = target_level - level;
+	}
+
+	const Cell *cells = bake_cells.ptr();
+	const Light *light = bake_light.ptr();
+
+	Vector3 color[2][8]; // [level][corner]
+	float alpha[2][8];
+	zeromem(alpha, sizeof(float) * 2 * 8);
+
+	//find cell at given level first
+
+	for (int c = 0; c < 2; c++) {
+		// c == 0 reads target_level, c == 1 reads the level above it (clamped at the root)
+		int current_level = MAX(0, target_level - c);
+		int level_cell_size = (1 << (cell_subdiv - 1)) >> current_level;
+
+		for (int n = 0; n < 8; n++) {
+			// n selects the corner: bit 0 offsets x, bit 1 offsets y, bit 2 offsets z by one cell of this level
+			int x = int(pos.x);
+			int y = int(pos.y);
+			int z = int(pos.z);
+
+			if (n & 1)
+				x += level_cell_size;
+			if (n & 2)
+				y += level_cell_size;
+			if (n & 4)
+				z += level_cell_size;
+
+			int ofs_x = 0;
+			int ofs_y = 0;
+			int ofs_z = 0;
+
+			x = CLAMP(x, 0, clamp_v);
+			y = CLAMP(y, 0, clamp_v);
+			z = CLAMP(z, 0, clamp_v);
+			// descend from the root (cell 0), choosing at each level the child octant that contains (x, y, z)
+			int half = size / 2;
+			uint32_t cell = 0;
+			for (int i = 0; i < current_level; i++) {
+
+				const Cell *bc = &cells[cell];
+
+				int child = 0;
+				if (x >= ofs_x + half) {
+					child |= 1;
+					ofs_x += half;
+				}
+				if (y >= ofs_y + half) {
+					child |= 2;
+					ofs_y += half;
+				}
+				if (z >= ofs_z + half) {
+					child |= 4;
+					ofs_z += half;
+				}
+
+				cell = bc->childs[child];
+				if (cell == CHILD_EMPTY)
+					break;
+
+				half >>= 1;
+			}
+
+			if (cell == CHILD_EMPTY) {
+				alpha[c][n] = 0; // nothing stored here, the color stays at its default value
+			} else {
+				alpha[c][n] = cells[cell].alpha;
+
+				for (int i = 0; i < 6; i++) {
+					//anisotropic read light: each of the 6 accum entries is weighted by how much p_direction points along aniso_normal[i]
+					float amount = p_direction.dot(aniso_normal[i]);
+					//if (c == 0) {
+					//	print_line("\t" + itos(n) + " aniso " + itos(i) + " " + rtos(light[cell].accum[i][0]) + " VEC: " + aniso_normal[i]);
+					//}
+					if (amount < 0)
+						amount = 0;
+					//amount = 1;
+					color[c][n].x += light[cell].accum[i][0] * amount;
+					color[c][n].y += light[cell].accum[i][1] * amount;
+					color[c][n].z += light[cell].accum[i][2] * amount;
+				}
+				// emission is added regardless of direction
+				color[c][n].x += cells[cell].emission[0];
+				color[c][n].y += cells[cell].emission[1];
+				color[c][n].z += cells[cell].emission[2];
+			}
+
+			//print_line("\tlev " + itos(c) + " - " + itos(n) + " alpha: " + rtos(cells[test_cell].alpha) + " col: " + color[c][n]);
+		}
+	}
+	// fractional position inside a cell of each sampled level, used as the interpolation weights below
+	float target_level_size = size >> target_level;
+	Vector3 pos_fract[2];
+
+	pos_fract[0].x = Math::fmod(pos.x, target_level_size) / target_level_size;
+	pos_fract[0].y = Math::fmod(pos.y, target_level_size) / target_level_size;
+	pos_fract[0].z = Math::fmod(pos.z, target_level_size) / target_level_size;
+	// same for the coarser level
+	target_level_size = size >> MAX(0, target_level - 1);
+
+	pos_fract[1].x = Math::fmod(pos.x, target_level_size) / target_level_size;
+	pos_fract[1].y = Math::fmod(pos.y, target_level_size) / target_level_size;
+	pos_fract[1].z = Math::fmod(pos.z, target_level_size) / target_level_size;
+
+	float alpha_interp[2];
+	Vector3 color_interp[2];
+	// trilinear interpolation of the 8 corners, done separately for each level: along x, then y, then z
+	for (int i = 0; i < 2; i++) {
+
+		Vector3 color_x00 = color[i][0].linear_interpolate(color[i][1], pos_fract[i].x);
+		Vector3 color_xy0 = color[i][2].linear_interpolate(color[i][3], pos_fract[i].x);
+		Vector3 blend_z0 = color_x00.linear_interpolate(color_xy0, pos_fract[i].y);
+
+		Vector3 color_x0z = color[i][4].linear_interpolate(color[i][5], pos_fract[i].x);
+		Vector3 color_xyz = color[i][6].linear_interpolate(color[i][7], pos_fract[i].x);
+		Vector3 blend_z1 = color_x0z.linear_interpolate(color_xyz, pos_fract[i].y);
+
+		color_interp[i] = blend_z0.linear_interpolate(blend_z1, pos_fract[i].z);
+
+		float alpha_x00 = Math::lerp(alpha[i][0], alpha[i][1], pos_fract[i].x);
+		float alpha_xy0 = Math::lerp(alpha[i][2], alpha[i][3], pos_fract[i].x);
+		float alpha_z0 = Math::lerp(alpha_x00, alpha_xy0, pos_fract[i].y);
+
+		float alpha_x0z = Math::lerp(alpha[i][4], alpha[i][5], pos_fract[i].x);
+		float alpha_xyz = Math::lerp(alpha[i][6], alpha[i][7], pos_fract[i].x);
+		float alpha_z1 = Math::lerp(alpha_x0z, alpha_xyz, pos_fract[i].y);
+
+		alpha_interp[i] = Math::lerp(alpha_z0, alpha_z1, pos_fract[i].z);
+	}
+	// finally blend between the two levels
+	r_color = color_interp[0].linear_interpolate(color_interp[1], level_filter);
+	r_alpha = Math::lerp(alpha_interp[0], alpha_interp[1], level_filter);
+
+	//	print_line("pos: " + p_posf + " level " + rtos(p_level) + " down to " + itos(target_level) + "." + rtos(level_filter) + " color " + r_color + " alpha " + rtos(r_alpha));
+}
+
+Vector3 VoxelLightBaker::_voxel_cone_trace(const Vector3 &p_pos, const Vector3 &p_normal, float p_aperture) {
+	// Marches a cone of the given aperture from p_pos along p_normal through the baked octree, compositing the sampled light front to back.
+	const float start_bias = 2.5; // initial offset along the cone axis
+	const float grid_diagonal = (Vector3(1, 1, 1) * (1 << (cell_subdiv - 1))).length();
+
+	Vector3 gathered;
+	float occlusion = 0.0;
+
+	// outputs of each octree sample
+	Vector3 sample_color;
+	float sample_alpha;
+
+	for (float travelled = start_bias; travelled < grid_diagonal && occlusion < 0.95;) {
+		// the cone widens with distance; log2 of its diameter (at least 1) is passed as the sampling level
+		float footprint = MAX(1.0, 2.0 * p_aperture * travelled);
+		Vector3 sample_pos = p_pos + travelled * p_normal;
+		_sample_baked_octree_filtered_and_anisotropic(sample_pos, p_normal, log2(footprint), sample_color, sample_alpha);
+		float visibility = (1.0 - occlusion);
+		gathered += sample_color * visibility;
+		occlusion += visibility * sample_alpha;
+		travelled += footprint * 0.5;
+	}
+
+	/*if (blend_ambient) {
+		color.rgb = mix(ambient,color.rgb,min(1.0,alpha/0.95));
+	}*/
+
+	return gathered;
+}
+
+Vector3 VoxelLightBaker::_compute_pixel_light_at_pos(const Vector3 &p_pos, const Vector3 &p_normal) {
+	// Gathers light at p_pos by tracing a fixed set of cones (chosen by bake_quality) oriented around p_normal and returning their weighted sum.
+	//find arbitrary tangent and bitangent, then build a matrix
+	Vector3 v0 = Math::abs(p_normal.z) < 0.999 ? Vector3(0, 0, 1) : Vector3(0, 1, 0);
+	Vector3 tangent = v0.cross(p_normal).normalized();
+	Vector3 bitangent = tangent.cross(p_normal).normalized();
+	Basis normal_xform = Basis(tangent, bitangent, p_normal).transposed();
+
+	//defaults keep the function well defined (no cones, zero light) if bake_quality matches none of the cases below
+	const Vector3 *cone_dirs = NULL;
+	const float *cone_weights = NULL;
+	int cone_dir_count = 0;
+	float cone_aperture = 0;
+
+	switch (bake_quality) {
+		case BAKE_QUALITY_LOW: {
+			//low quality: 4 wide cones
+			static const Vector3 dirs[4] = {
+				Vector3(0.707107, 0, 0.707107),
+				Vector3(0, 0.707107, 0.707107),
+				Vector3(-0.707107, 0, 0.707107),
+				Vector3(0, -0.707107, 0.707107)
+			};
+
+			static const float weights[4] = { 0.25, 0.25, 0.25, 0.25 };
+
+			cone_dirs = dirs;
+			cone_dir_count = 4;
+			cone_aperture = 1.0; // tan(half angle) of a 90 degree cone
+			cone_weights = weights;
+		} break;
+		case BAKE_QUALITY_MEDIUM: {
+			//medium quality: 6 cones, one along the normal and five around it
+			static const Vector3 dirs[6] = {
+				Vector3(0, 0, 1),
+				Vector3(0.866025, 0, 0.5),
+				Vector3(0.267617, 0.823639, 0.5),
+				Vector3(-0.700629, 0.509037, 0.5),
+				Vector3(-0.700629, -0.509037, 0.5),
+				Vector3(0.267617, -0.823639, 0.5)
+			};
+			static const float weights[6] = { 0.25, 0.15, 0.15, 0.15, 0.15, 0.15 };
+			//
+			cone_dirs = dirs;
+			cone_dir_count = 6;
+			cone_aperture = 0.577; // tan(half angle) of a 60 degree cone
+			cone_weights = weights;
+		} break;
+		case BAKE_QUALITY_HIGH: {
+
+			//high quality: 10 narrow cones
+			static const Vector3 dirs[10] = {
+				Vector3(0.8781648411741658, 0.0, 0.478358141694643),
+				Vector3(0.5369754325592234, 0.6794204427701518, 0.5000452447267606),
+				Vector3(-0.19849436573466497, 0.8429904390140635, 0.49996710542041645),
+				Vector3(-0.7856196499811189, 0.3639120321329737, 0.5003696617825604),
+				Vector3(-0.7856196499811189, -0.3639120321329737, 0.5003696617825604),
+				Vector3(-0.19849436573466497, -0.8429904390140635, 0.49996710542041645),
+				Vector3(0.5369754325592234, -0.6794204427701518, 0.5000452447267606),
+				Vector3(-0.4451656858129485, 0.0, 0.8954482185892644),
+				Vector3(0.19124006749743122, 0.39355745585016605, 0.8991883926788214),
+				Vector3(0.19124006749743122, -0.39355745585016605, 0.8991883926788214),
+			};
+			static const float weights[10] = { 0.08571, 0.08571, 0.08571, 0.08571, 0.08571, 0.08571, 0.08571, 0.133333, 0.133333, 0.13333 };
+			cone_dirs = dirs;
+			cone_dir_count = 10;
+			cone_aperture = 0.404; // tan(half angle) of a roughly 45 degree cone
+			cone_weights = weights;
+		} break;
+	}
+
+	Vector3 accum;
+
+	for (int i = 0; i < cone_dir_count; i++) {
+		//	if (i > 0)
+		//		continue;
+		Vector3 dir = normal_xform.xform(cone_dirs[i]).normalized(); //normal may not completely correct when transformed to cell
+		//print_line("direction: " + dir);
+		accum += _voxel_cone_trace(p_pos, dir, cone_aperture) * cone_weights[i];
+	}
+
+	return accum;
+}
+
+Vector3 VoxelLightBaker::_compute_ray_trace_at_pos(const Vector3 &p_pos, const Vector3 &p_normal) {
+	// Gathers light at p_pos by shooting random rays around p_normal through the octree and averaging the light of the first cell each ray reaches.
+	int rays_per_quality[3] = { 48, 128, 512 };
+
+	int ray_count = rays_per_quality[bake_quality];
+
+	//build a basis whose Z axis is the normal
+	Vector3 helper_axis = Math::abs(p_normal.z) < 0.999 ? Vector3(0, 0, 1) : Vector3(0, 1, 0);
+	Vector3 tangent = helper_axis.cross(p_normal).normalized();
+	Vector3 bitangent = tangent.cross(p_normal).normalized();
+	Basis normal_xform = Basis(tangent, bitangent, p_normal).transposed();
+
+	float start_bias = 1.5;
+	int leaf_depth = cell_subdiv - 1;
+	int grid_size = 1 << leaf_depth;
+
+	Vector3 gathered;
+	float max_spread = Math::deg2rad(80.0);
+
+	const Light *light = bake_light.ptr();
+	const Cell *cells = bake_cells.ptr();
+
+	for (int ray = 0; ray < ray_count; ray++) {
+		//random direction: tilt away from +Z by up to max_spread, then spin around Z
+		float tilt = (((Math::rand() % 65535) / 65535.0) * 2.0 - 1.0) * max_spread;
+		Vector3 axis(0, sin(tilt), cos(tilt));
+		float spin = ((Math::rand() % 65535) / 65535.0) * Math_PI * 2.0;
+		Basis rot(Vector3(0, 0, 1), spin);
+		axis = rot.xform(axis);
+
+		Vector3 direction = normal_xform.xform(axis).normalized();
+
+		Vector3 pos = p_pos + Vector3(0.5, 0.5, 0.5) + direction * start_bias;
+
+		Vector3 advance = direction * _get_normal_advance(direction);
+
+		uint32_t cell = CHILD_EMPTY;
+		//step along the ray until a cell is found at leaf depth or the ray leaves the grid
+		while (cell == CHILD_EMPTY) {
+
+			int x = int(pos.x);
+			int y = int(pos.y);
+			int z = int(pos.z);
+
+			int origin_x = 0;
+			int origin_y = 0;
+			int origin_z = 0;
+			int half = grid_size / 2;
+
+			bool outside_x = x < 0 || x >= grid_size;
+			bool outside_y = y < 0 || y >= grid_size;
+			bool outside_z = z < 0 || z >= grid_size;
+			if (outside_x || outside_y || outside_z) {
+				break; //left the grid without reaching a cell
+			}
+
+			//descend from the root towards the cell that contains (x, y, z)
+
+			cell = 0; //start from root
+			for (int depth = 0; depth < leaf_depth; depth++) {
+
+				const Cell *node = &cells[cell];
+
+				int octant = 0;
+				if (x >= origin_x + half) {
+					octant |= 1;
+					origin_x += half;
+				}
+				if (y >= origin_y + half) {
+					octant |= 2;
+					origin_y += half;
+				}
+				if (z >= origin_z + half) {
+					octant |= 4;
+					origin_z += half;
+				}
+
+				cell = node->childs[octant];
+				if (cell == CHILD_EMPTY)
+					break;
+
+				half >>= 1;
+			}
+
+			pos += advance;
+		}
+
+		if (cell != CHILD_EMPTY) {
+			for (int side = 0; side < 6; side++) {
+				//anisotropic read light, weighted by how much the ray points along each stored direction
+				float amount = direction.dot(aniso_normal[side]);
+				if (amount < 0)
+					amount = 0;
+				gathered.x += light[cell].accum[side][0] * amount;
+				gathered.y += light[cell].accum[side][1] * amount;
+				gathered.z += light[cell].accum[side][2] * amount;
+			}
+		}
+	}
+
+	return gathered / ray_count;
+}
+
+Error VoxelLightBaker::make_lightmap(const Transform &p_xform, Ref<Mesh> &p_mesh, LightMapData &r_lightmap, bool (*p_bake_time_func)(void *, float, float), void *p_bake_time_ud) {
+	// Bakes a lightmap for p_mesh: plots its UV2 triangles, gathers light per texel (cone or ray trace), adds the stored direct light, fills the empty border texels and writes RGB floats to r_lightmap. Returns ERR_SKIP when p_bake_time_func returns true.
+	//transfer light information to a lightmap
+	Ref<Mesh> mesh = p_mesh;
+
+	int width = mesh->get_lightmap_size_hint().x;
+	int height = mesh->get_lightmap_size_hint().y;
+
+	//step 1 - create lightmap
+	Vector<LightMap> lightmap;
+	lightmap.resize(width * height);
+	//mesh space -> cell space
+	Transform xform = to_cell_space * p_xform;
+
+	//step 2 plot faces to lightmap
+	for (int i = 0; i < mesh->get_surface_count(); i++) {
+		Array arrays = mesh->surface_get_arrays(i);
+		PoolVector<Vector3> vertices = arrays[Mesh::ARRAY_VERTEX];
+		PoolVector<Vector3> normals = arrays[Mesh::ARRAY_NORMAL];
+		PoolVector<Vector2> uv2 = arrays[Mesh::ARRAY_TEX_UV2];
+		PoolVector<int> indices = arrays[Mesh::ARRAY_INDEX];
+
+		ERR_FAIL_COND_V(vertices.size() == 0, ERR_INVALID_PARAMETER);
+		ERR_FAIL_COND_V(normals.size() == 0, ERR_INVALID_PARAMETER);
+		ERR_FAIL_COND_V(uv2.size() == 0, ERR_INVALID_PARAMETER);
+
+		int vc = vertices.size();
+		PoolVector<Vector3>::Read vr = vertices.read();
+		PoolVector<Vector3>::Read nr = normals.read();
+		PoolVector<Vector2>::Read u2r = uv2.read();
+		PoolVector<int>::Read ir;
+		int ic = 0;
+
+		if (indices.size()) {
+			ic = indices.size();
+			ir = indices.read();
+		}
+		//indexed surfaces take 3 indices per face, non indexed ones 3 consecutive vertices
+		int faces = ic ? ic / 3 : vc / 3;
+		for (int f = 0; f < faces; f++) {
+			Vector3 vertex[3];
+			Vector3 normal[3];
+			Vector2 uv[3];
+			for (int j = 0; j < 3; j++) {
+				int idx = ic ? ir[f * 3 + j] : f * 3 + j;
+				vertex[j] = xform.xform(vr[idx]);
+				normal[j] = xform.basis.xform(nr[idx]).normalized();
+				uv[j] = u2r[idx];
+			}
+
+			_plot_triangle(uv, vertex, normal, lightmap.ptrw(), width, height);
+		}
+	}
+	//step 3 perform voxel cone trace on lightmap pixels
+
+	{
+		LightMap *lightmap_ptr = lightmap.ptrw();
+		uint64_t begin_time = OS::get_singleton()->get_ticks_usec();
+		volatile int lines = 0;
+
+		for (int i = 0; i < height; i++) {
+
+		//print_line("bake line " + itos(i) + " / " + itos(height));
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+			for (int j = 0; j < width; j++) {
+
+				//if (i == 125 && j == 280) {
+
+				LightMap *pixel = &lightmap_ptr[i * width + j];
+				if (pixel->pos == Vector3())
+					continue; //unused, skip
+
+				//print_line("pos: " + pixel->pos + " normal " + pixel->normal);
+				switch (bake_mode) {
+					case BAKE_MODE_CONE_TRACE: {
+						pixel->light = _compute_pixel_light_at_pos(pixel->pos, pixel->normal) * energy;
+					} break;
+					case BAKE_MODE_RAY_TRACE: {
+						pixel->light = _compute_ray_trace_at_pos(pixel->pos, pixel->normal) * energy;
+					} break;
+						//	pixel->light = Vector3(1, 1, 1);
+						//}
+				}
+			}
+
+			lines = MAX(lines, i); //for multithread
+			if (p_bake_time_func) {
+				uint64_t elapsed = OS::get_singleton()->get_ticks_usec() - begin_time;
+				float elapsed_sec = double(elapsed) / 1000000.0;
+				float remaining = lines < 1 ? 0 : (elapsed_sec / lines) * (height - lines - 1);
+				if (p_bake_time_func(p_bake_time_ud, remaining, lines / float(height))) {
+					return ERR_SKIP;
+				}
+			}
+		}
+
+		if (bake_mode == BAKE_MODE_RAY_TRACE) {
+			Vector<Vector3> blur_scratch; //blur: the horizontal pass goes to a scratch buffer, pixel->pos must survive for the direct light pass below
+			print_line("bluring, using scratch buffer for separable copy");
+			blur_scratch.resize(width * height);
+			static const float gauss_kernel[4] = { 0.214607, 0.189879, 0.131514, 0.071303 }; //gauss kernel, 7 step sigma 2
+			Vector3 *blur_ptr = blur_scratch.ptrw(); //horizontal pass
+			for (int i = 0; i < height; i++) {
+				for (int j = 0; j < width; j++) {
+					if (lightmap_ptr[i * width + j].normal == Vector3()) {
+						continue; //empty
+					}
+					float gauss_sum = gauss_kernel[0];
+					Vector3 accum = lightmap_ptr[i * width + j].light * gauss_kernel[0];
+					for (int k = 1; k < 4; k++) {
+						int new_x = j + k;
+						if (new_x >= width || lightmap_ptr[i * width + new_x].normal == Vector3())
+							break;
+						gauss_sum += gauss_kernel[k];
+						accum += lightmap_ptr[i * width + new_x].light * gauss_kernel[k];
+					}
+					for (int k = 1; k < 4; k++) {
+						int new_x = j - k;
+						if (new_x < 0 || lightmap_ptr[i * width + new_x].normal == Vector3())
+							break;
+						gauss_sum += gauss_kernel[k];
+						accum += lightmap_ptr[i * width + new_x].light * gauss_kernel[k];
+					}
+
+					blur_ptr[i * width + j] = accum /= gauss_sum;
+				}
+			}
+			//vertical pass, reads the scratch buffer and writes the final light
+			for (int i = 0; i < height; i++) {
+				for (int j = 0; j < width; j++) {
+					if (lightmap_ptr[i * width + j].normal == Vector3())
+						continue; //empty, dont write over it anyway
+					float gauss_sum = gauss_kernel[0];
+					Vector3 accum = blur_ptr[i * width + j] * gauss_kernel[0];
+					for (int k = 1; k < 4; k++) {
+						int new_y = i + k;
+						if (new_y >= height || lightmap_ptr[new_y * width + j].normal == Vector3())
+							break;
+						gauss_sum += gauss_kernel[k];
+						accum += blur_ptr[new_y * width + j] * gauss_kernel[k];
+					}
+					for (int k = 1; k < 4; k++) {
+						int new_y = i - k;
+						if (new_y < 0 || lightmap_ptr[new_y * width + j].normal == Vector3())
+							break;
+						gauss_sum += gauss_kernel[k];
+						accum += blur_ptr[new_y * width + j] * gauss_kernel[k];
+					}
+
+					lightmap_ptr[i * width + j].light = accum /= gauss_sum;
+				}
+			}
+		}
+
+		//add directional light (do this after blur)
+		{
+			//lightmap_ptr from the enclosing scope is reused here
+			const Cell *cells = bake_cells.ptr();
+			const Light *light = bake_light.ptr();
+
+			for (int i = 0; i < height; i++) {
+
+			//print_line("bake line " + itos(i) + " / " + itos(height));
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+				for (int j = 0; j < width; j++) {
+
+					//if (i == 125 && j == 280) {
+
+					LightMap *pixel = &lightmap_ptr[i * width + j];
+					if (pixel->pos == Vector3())
+						continue; //unused, skip
+					//average the direct light of the 2x2x2 cells starting one cell below the texel position
+					int x = int(pixel->pos.x) - 1;
+					int y = int(pixel->pos.y) - 1;
+					int z = int(pixel->pos.z) - 1;
+					Color accum;
+					int size = 1 << (cell_subdiv - 1);
+
+					int found = 0;
+
+					for (int k = 0; k < 8; k++) {
+
+						int ofs_x = x;
+						int ofs_y = y;
+						int ofs_z = z;
+
+						if (k & 1)
+							ofs_x++;
+						if (k & 2)
+							ofs_y++;
+						if (k & 4)
+							ofs_z++;
+						//the offset neighbour is what gets looked up, so that is what must lie inside the grid
+						if (ofs_x < 0 || ofs_x >= size)
+							continue;
+						if (ofs_y < 0 || ofs_y >= size)
+							continue;
+						if (ofs_z < 0 || ofs_z >= size)
+							continue;
+
+						uint32_t cell = _find_cell_at_pos(cells, ofs_x, ofs_y, ofs_z);
+
+						if (cell == CHILD_EMPTY)
+							continue;
+						for (int l = 0; l < 6; l++) {
+							float s = pixel->normal.dot(aniso_normal[l]);
+							if (s < 0)
+								s = 0;
+							accum.r += light[cell].direct_accum[l][0] * s;
+							accum.g += light[cell].direct_accum[l][1] * s;
+							accum.b += light[cell].direct_accum[l][2] * s;
+						}
+						found++;
+					}
+					if (found) {
+						accum /= found;
+						pixel->light.x += accum.r;
+						pixel->light.y += accum.g;
+						pixel->light.z += accum.b;
+					}
+				}
+			}
+		}
+
+		{
+			//fill gaps with neighbour vertices to avoid filter fades to black on edges
+
+			for (int i = 0; i < height; i++) {
+				for (int j = 0; j < width; j++) {
+					if (lightmap_ptr[i * width + j].normal != Vector3()) {
+						continue; //filled, skip
+					}
+
+					//this can't be made separatable..
+
+					int closest_i = -1, closest_j = -1;
+					float closest_dist = 1e20;
+
+					const int margin = 3;
+					for (int y = i - margin; y <= i + margin; y++) {
+						for (int x = j - margin; x <= j + margin; x++) {
+
+							if (x == j && y == i)
+								continue;
+							if (x < 0 || x >= width)
+								continue;
+							if (y < 0 || y >= height)
+								continue;
+							if (lightmap_ptr[y * width + x].normal == Vector3())
+								continue; //also ensures that blitted stuff is not reused
+
+							float dist = Vector2(i - y, j - x).length();
+							if (dist > closest_dist)
+								continue;
+
+							closest_dist = dist;
+							closest_i = y;
+							closest_j = x;
+						}
+					}
+
+					if (closest_i != -1) {
+						lightmap_ptr[i * width + j].light = lightmap_ptr[closest_i * width + closest_j].light;
+					}
+				}
+			}
+		}
+
+		{
+			//fill the lightmap data
+			r_lightmap.width = width;
+			r_lightmap.height = height;
+			r_lightmap.light.resize(lightmap.size() * 3);
+			PoolVector<float>::Write w = r_lightmap.light.write();
+			for (int i = 0; i < lightmap.size(); i++) {
+				w[i * 3 + 0] = lightmap[i].light.x;
+				w[i * 3 + 1] = lightmap[i].light.y;
+				w[i * 3 + 2] = lightmap[i].light.z;
+			}
+		}
+
+#if 0
+		{
+			PoolVector<uint8_t> img;
+			int ls = lightmap.size();
+			img.resize(ls * 3);
+			{
+				PoolVector<uint8_t>::Write w = img.write();
+				for (int i = 0; i < ls; i++) {
+					w[i * 3 + 0] = CLAMP(lightmap_ptr[i].light.x * 255, 0, 255);
+					w[i * 3 + 1] = CLAMP(lightmap_ptr[i].light.y * 255, 0, 255);
+					w[i * 3 + 2] = CLAMP(lightmap_ptr[i].light.z * 255, 0, 255);
+					//w[i * 3 + 0] = CLAMP(lightmap_ptr[i].normal.x * 255, 0, 255);
+					//w[i * 3 + 1] = CLAMP(lightmap_ptr[i].normal.y * 255, 0, 255);
+					//w[i * 3 + 2] = CLAMP(lightmap_ptr[i].normal.z * 255, 0, 255);
+					//w[i * 3 + 0] = CLAMP(lightmap_ptr[i].pos.x / (1 << (cell_subdiv - 1)) * 255, 0, 255);
+					//w[i * 3 + 1] = CLAMP(lightmap_ptr[i].pos.y / (1 << (cell_subdiv - 1)) * 255, 0, 255);
+					//w[i * 3 + 2] = CLAMP(lightmap_ptr[i].pos.z / (1 << (cell_subdiv - 1)) * 255, 0, 255);
+				}
+			}
+
+			Ref<Image> image;
+			image.instance();
+			image->create(width, height, false, Image::FORMAT_RGB8, img);
+
+			String name = p_mesh->get_name();
+			if (name == "") {
+				name = "Mesh" + itos(p_mesh->get_instance_id());
+			}
+			image->save_png(name + ".png");
+		}
+#endif
+	}
+
+	return OK;
+}
+
+void VoxelLightBaker::begin_bake(int p_subdiv, const AABB &p_bounds) {
+	// Prepares a bake of p_bounds at p_subdiv levels: resets cell storage and the material cache, derives the cubic bounds and per axis cell counts, and builds the to_cell_space transform.
+	original_bounds = p_bounds;
+	cell_subdiv = p_subdiv;
+	bake_cells.resize(1);
+	material_cache.clear();
+
+	//find out the actual real bounds, power of 2, which gets the highest subdivision
+	po2_bounds = p_bounds;
+	int longest_axis = po2_bounds.get_longest_axis_index();
+	axis_cell_size[longest_axis] = (1 << (cell_subdiv - 1));
+	leaf_voxel_count = 0;
+
+	for (int i = 0; i < 3; i++) {
+
+		if (i == longest_axis)
+			continue;
+
+		axis_cell_size[i] = axis_cell_size[longest_axis];
+		float axis_size = po2_bounds.size[longest_axis];
+
+		//shrink until fit subdiv; never below one cell, which also ends the loop for a zero sized axis
+		while (axis_cell_size[i] > 1 && axis_size / 2.0 >= po2_bounds.size[i]) {
+			axis_size /= 2.0;
+			axis_cell_size[i] >>= 1;
+		}
+		//the bounds become a cube with the size of the longest axis
+		po2_bounds.size[i] = po2_bounds.size[longest_axis];
+	}
+
+	Transform to_bounds;
+	to_bounds.basis.scale(Vector3(po2_bounds.size[longest_axis], po2_bounds.size[longest_axis], po2_bounds.size[longest_axis]));
+	to_bounds.origin = po2_bounds.position;
+
+	Transform to_grid;
+	to_grid.basis.scale(Vector3(axis_cell_size[longest_axis], axis_cell_size[longest_axis], axis_cell_size[longest_axis]));
+	//undo the bounds transform, then scale up to grid cells
+	to_cell_space = to_grid * to_bounds.affine_inverse();
+	//edge length of one cell along the longest axis
+	cell_size = po2_bounds.size[longest_axis] / axis_cell_size[longest_axis];
+}
+
+void VoxelLightBaker::end_bake() {
+	this->_fixup_plot(0, 0); // finishes the bake; the arguments are presumably root cell index and level 0 - confirm against _fixup_plot()
+}
+
+//create the data for visual server
+
+PoolVector<int> VoxelLightBaker::create_gi_probe_data() {
+	// Packs the baked cells into a flat int array: a 16 word header followed by 12 words per cell (8 child indices, albedo, emission, normal, level/alpha).
+	PoolVector<int> data;
+
+	data.resize(16 + (8 + 1 + 1 + 1 + 1) * bake_cells.size()); //16 for header, rest for the cells.
+
+	{
+		PoolVector<int>::Write w = data.write();
+
+		uint32_t *w32 = (uint32_t *)w.ptr();
+
+		w32[0] = 0; //version
+		w32[1] = cell_subdiv; //subdiv
+		w32[2] = axis_cell_size[0];
+		w32[3] = axis_cell_size[1];
+		w32[4] = axis_cell_size[2];
+		w32[5] = bake_cells.size();
+		w32[6] = leaf_voxel_count;
+		zeromem(w32 + 7, sizeof(uint32_t) * 9); //unused header words 7..15; resize() is not assumed to zero fill, so clear them
+		int ofs = 16;
+
+		for (int i = 0; i < bake_cells.size(); i++) {
+
+			for (int j = 0; j < 8; j++) {
+				w32[ofs++] = bake_cells[i].childs[j];
+			}
+
+			{ //albedo, 8 bits per channel packed as 0x00RRGGBB
+				uint32_t rgba = uint32_t(CLAMP(bake_cells[i].albedo[0] * 255.0, 0, 255)) << 16;
+				rgba |= uint32_t(CLAMP(bake_cells[i].albedo[1] * 255.0, 0, 255)) << 8;
+				rgba |= uint32_t(CLAMP(bake_cells[i].albedo[2] * 255.0, 0, 255)) << 0;
+
+				w32[ofs++] = rgba;
+			}
+			{ //emission, stored as a normalized color (upper 24 bits) plus its length divided by 8 (low 8 bits)
+
+				Vector3 e(bake_cells[i].emission[0], bake_cells[i].emission[1], bake_cells[i].emission[2]);
+				float l = e.length();
+				if (l > 0) {
+					e.normalize();
+					l = CLAMP(l / 8.0, 0, 1.0);
+				}
+
+				uint32_t em = uint32_t(CLAMP(e[0] * 255, 0, 255)) << 24;
+				em |= uint32_t(CLAMP(e[1] * 255, 0, 255)) << 16;
+				em |= uint32_t(CLAMP(e[2] * 255, 0, 255)) << 8;
+				em |= uint32_t(CLAMP(l * 255, 0, 255));
+
+				w32[ofs++] = em;
+			}
+
+			//w32[ofs++]=bake_cells[i].used_sides;
+			{ //normal, remapped from -1..1 to 0..1 and packed with 8 bits per axis
+
+				Vector3 n(bake_cells[i].normal[0], bake_cells[i].normal[1], bake_cells[i].normal[2]);
+				n = n * Vector3(0.5, 0.5, 0.5) + Vector3(0.5, 0.5, 0.5);
+				uint32_t norm = 0;
+
+				norm |= uint32_t(CLAMP(n.x * 255.0, 0, 255)) << 16;
+				norm |= uint32_t(CLAMP(n.y * 255.0, 0, 255)) << 8;
+				norm |= uint32_t(CLAMP(n.z * 255.0, 0, 255)) << 0;
+
+				w32[ofs++] = norm;
+			}
+
+			{ //level in the upper 16 bits, alpha scaled to 0..65535 in the lower 16 bits
+				uint16_t alpha = CLAMP(uint32_t(bake_cells[i].alpha * 65535.0), 0, 65535);
+				uint16_t level = bake_cells[i].level;
+
+				w32[ofs++] = (uint32_t(level) << 16) | uint32_t(alpha);
+			}
+		}
+	}
+
+	return data;
+}
+
+void VoxelLightBaker::_debug_mesh(int p_idx, int p_level, const AABB &p_aabb, Ref<MultiMesh> &p_multimesh, int &idx, DebugMode p_mode) {
+	// Walks the octree below cell p_idx and fills one multimesh instance per cell found at the deepest level; idx is the next instance slot and is advanced as instances are written.
+	if (p_level != cell_subdiv - 1) {
+		//not at the deepest level yet: recurse into every child that exists and has an index below max_original_cells
+		for (int octant = 0; octant < 8; octant++) {
+
+			uint32_t child = bake_cells[p_idx].childs[octant];
+			if (child == CHILD_EMPTY || child >= max_original_cells)
+				continue;
+
+			//the child covers half of the parent box along each axis, offset by the octant bits
+			AABB child_aabb = p_aabb;
+			child_aabb.size *= 0.5;
+			if (octant & 1)
+				child_aabb.position.x += child_aabb.size.x;
+			if (octant & 2)
+				child_aabb.position.y += child_aabb.size.y;
+			if (octant & 4)
+				child_aabb.position.z += child_aabb.size.z;
+
+			_debug_mesh(child, p_level + 1, child_aabb, p_multimesh, idx, p_mode);
+		}
+
+		return;
+	}
+
+	//deepest level: place an instance at the center of the box, scaled by half the box size
+	Transform xform;
+	xform.origin = p_aabb.position + p_aabb.size * 0.5;
+	xform.basis.scale(p_aabb.size * 0.5);
+	p_multimesh->set_instance_transform(idx, xform);
+
+	Color tint;
+	if (p_mode == DEBUG_ALBEDO) {
+		tint = Color(bake_cells[p_idx].albedo[0], bake_cells[p_idx].albedo[1], bake_cells[p_idx].albedo[2]);
+	} else if (p_mode == DEBUG_LIGHT) {
+		//sum accumulated and direct light over the 6 stored directions
+		for (int side = 0; side < 6; side++) {
+			tint.r += bake_light[p_idx].accum[side][0];
+			tint.g += bake_light[p_idx].accum[side][1];
+			tint.b += bake_light[p_idx].accum[side][2];
+			tint.r += bake_light[p_idx].direct_accum[side][0];
+			tint.g += bake_light[p_idx].direct_accum[side][1];
+			tint.b += bake_light[p_idx].direct_accum[side][2];
+		}
+	}
+	p_multimesh->set_instance_color(idx, tint);
+
+	idx++;
+}
+
+Ref<MultiMesh> VoxelLightBaker::create_debug_multimesh(DebugMode p_mode) {
+	// Builds a MultiMesh for visualizing the bake: a unit cube mesh with leaf_voxel_count instances, positioned and colored by _debug_mesh() according to p_mode. Returns an empty Ref if DEBUG_LIGHT is requested while bake_light is empty.
+	Ref<MultiMesh> mm;
+
+	ERR_FAIL_COND_V(p_mode == DEBUG_LIGHT && bake_light.size() == 0, mm);
+	mm.instance();
+
+	mm->set_transform_format(MultiMesh::TRANSFORM_3D);
+	mm->set_color_format(MultiMesh::COLOR_8BIT);
+	print_line("leaf voxels: " + itos(leaf_voxel_count));
+	mm->set_instance_count(leaf_voxel_count);
+
+	Ref<ArrayMesh> mesh;
+	mesh.instance();
+	// build a cube spanning -1..1 on every axis: 6 faces of 2 triangles each, all vertices colored white
+	{
+		Array arr;
+		arr.resize(Mesh::ARRAY_MAX);
+
+		PoolVector<Vector3> vertices;
+		PoolVector<Color> colors;
+
+		int vtx_idx = 0;
+#define ADD_VTX(m_idx)                      \
+	;                                       \
+	vertices.push_back(face_points[m_idx]); \
+	colors.push_back(Color(1, 1, 1, 1));    \
+	vtx_idx++;
+
+		for (int i = 0; i < 6; i++) {
+			// i selects the face: 0..2 put +1 on axis i, 3..5 put -1 on axis (i % 3) and store the corners in reverse order
+			Vector3 face_points[4];
+
+			for (int j = 0; j < 4; j++) {
+				// v[0] is the coordinate along the face axis, v[1] and v[2] walk the four corners
+				float v[3];
+				v[0] = 1.0;
+				v[1] = 1 - 2 * ((j >> 1) & 1);
+				v[2] = v[1] * (1 - 2 * (j & 1));
+
+				for (int k = 0; k < 3; k++) {
+
+					if (i < 3)
+						face_points[j][(i + k) % 3] = v[k] * (i >= 3 ? -1 : 1); // the ternary is always 1 in this branch
+					else
+						face_points[3 - j][(i + k) % 3] = v[k] * (i >= 3 ? -1 : 1); // the ternary is always -1 in this branch
+				}
+			}
+
+			//tri 1
+			ADD_VTX(0);
+			ADD_VTX(1);
+			ADD_VTX(2);
+			//tri 2
+			ADD_VTX(2);
+			ADD_VTX(3);
+			ADD_VTX(0);
+		}
+
+		arr[Mesh::ARRAY_VERTEX] = vertices;
+		arr[Mesh::ARRAY_COLOR] = colors;
+		mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, arr);
+	}
+	// unshaded material that takes its albedo from the vertex color
+	{
+		Ref<SpatialMaterial> fsm;
+		fsm.instance();
+		fsm->set_flag(SpatialMaterial::FLAG_SRGB_VERTEX_COLOR, true);
+		fsm->set_flag(SpatialMaterial::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
+		fsm->set_flag(SpatialMaterial::FLAG_UNSHADED, true);
+		fsm->set_albedo(Color(1, 1, 1, 1));
+
+		mesh->surface_set_material(0, fsm);
+	}
+
+	mm->set_mesh(mesh);
+	// fill instance transforms and colors, starting at the root cell with the full bounds
+	int idx = 0;
+	_debug_mesh(0, 0, po2_bounds, mm, idx, p_mode);
+
+	return mm;
+}
+
+struct VoxelLightBakerOctree {
+	// One node of the capture octree. create_capture_octree() copies an array of these byte for byte into its result, so the field layout is part of that output.
+	enum {
+		CHILD_EMPTY = 0xFFFFFFFF //marks a missing child in children[]
+	};
+
+	uint16_t light[6][3]; //anisotropic light, 3 channels for each of 6 directions
+	float alpha;
+	uint32_t children[8]; //indices of the child nodes, or CHILD_EMPTY
+};
+
+PoolVector<uint8_t> VoxelLightBaker::create_capture_octree(int p_subdiv) {
+	// Exports the cells above level p_subdiv as an array of VoxelLightBakerOctree nodes (light plus emission, alpha, remapped children), returned as raw bytes. Fails with an empty result if no light has been baked.
+	p_subdiv = MIN(p_subdiv, cell_subdiv); // use the smaller one
+	ERR_FAIL_COND_V(bake_light.size() == 0, PoolVector<uint8_t>()); //bake_light is indexed below
+	Vector<uint32_t> remap;
+	int bc = bake_cells.size();
+	remap.resize(bc);
+	Vector<uint32_t> demap;
+	// remap: bake cell index -> octree index (or CHILD_EMPTY if dropped), demap: the reverse
+	int new_size = 0;
+	for (int i = 0; i < bc; i++) {
+		uint32_t c = CHILD_EMPTY;
+		if (bake_cells[i].level < p_subdiv) {
+			c = new_size;
+			new_size++;
+			demap.push_back(i);
+		}
+		remap[i] = c;
+	}
+
+	Vector<VoxelLightBakerOctree> octree;
+	octree.resize(new_size);
+
+	for (int i = 0; i < new_size; i++) {
+		octree[i].alpha = bake_cells[demap[i]].alpha;
+		for (int j = 0; j < 6; j++) {
+			for (int k = 0; k < 3; k++) {
+				float l = bake_light[demap[i]].accum[j][k]; //add anisotropic light
+				l += bake_cells[demap[i]].emission[k]; //add emission
+				octree[i].light[j][k] = CLAMP(l * 1024, 0, 65535); //give two more bits to octree
+			}
+		}
+		// children that were dropped above end up as CHILD_EMPTY through remap
+		for (int j = 0; j < 8; j++) {
+			uint32_t child = bake_cells[demap[i]].childs[j];
+			octree[i].children[j] = child == CHILD_EMPTY ? CHILD_EMPTY : remap[child];
+		}
+	}
+
+	PoolVector<uint8_t> ret;
+	int ret_bytes = octree.size() * sizeof(VoxelLightBakerOctree);
+	ret.resize(ret_bytes);
+	{
+		PoolVector<uint8_t>::Write w = ret.write();
+		copymem(w.ptr(), octree.ptr(), ret_bytes);
+	}
+
+	return ret;
+}
+
+float VoxelLightBaker::get_cell_size() const {
+	return this->cell_size; // set by begin_bake(): size of the longest bounds axis divided by its cell count
+}
+
+Transform VoxelLightBaker::get_to_cell_space_xform() const {
+	return this->to_cell_space; // built by begin_bake() from the cubic bounds and the grid scale
+}
+VoxelLightBaker::VoxelLightBaker() {
+	this->energy = 1.0; // multiplier applied to the gathered light in make_lightmap()
+	this->propagation = 0.85;
+	this->bake_texture_size = 128;
+	this->color_scan_cell_width = 4; // NOTE(review): bake_quality, bake_mode and cell_subdiv are not set here - confirm they are assigned before use
+}
diff --git a/scene/3d/voxel_light_baker.h b/scene/3d/voxel_light_baker.h
new file mode 100644
index 0000000000..6dee2ee69b
--- /dev/null
+++ b/scene/3d/voxel_light_baker.h
@@ -0,0 +1,148 @@
+#ifndef VOXEL_LIGHT_BAKER_H
+#define VOXEL_LIGHT_BAKER_H
+
+#include "scene/3d/mesh_instance.h"
+#include "scene/resources/multimesh.h"
+
+class VoxelLightBaker { // voxel light baker; public entry points are the begin_bake/plot_*/end_bake family and the make_lightmap/create_* methods declared below
+public:
+	enum DebugMode { // mode argument of create_debug_multimesh()
+		DEBUG_ALBEDO,
+		DEBUG_LIGHT
+	};
+
+	enum BakeQuality { // quality argument of begin_bake_light()
+		BAKE_QUALITY_LOW,
+		BAKE_QUALITY_MEDIUM,
+		BAKE_QUALITY_HIGH
+	};
+
+	enum BakeMode { // mode argument of begin_bake_light()
+		BAKE_MODE_CONE_TRACE,
+		BAKE_MODE_RAY_TRACE,
+	};
+
+private:
+	enum {
+		CHILD_EMPTY = 0xFFFFFFFF // marks an unused slot in Cell::childs
+
+	};
+
+	struct Cell {
+		// One octree node; Cell() leaves it with no children and every value zeroed.
+		uint32_t childs[8]; // child indices into bake_cells, or CHILD_EMPTY
+		float albedo[3]; //albedo, stored as three floats (NOTE(review): the previous "RGB24" wording did not match the float type)
+		float emission[3]; //emission, stored as three floats (NOTE(review): the previous "16:16 fixed point" wording did not match the float type)
+		float normal[3];
+		uint32_t used_sides;
+		float alpha; //used for upsampling
+		int level; // octree level of this cell; create_capture_octree() keeps cells with level < p_subdiv
+
+		Cell() {
+			for (int i = 0; i < 8; i++) {
+				childs[i] = CHILD_EMPTY;
+			}
+
+			for (int i = 0; i < 3; i++) {
+				emission[i] = 0;
+				albedo[i] = 0;
+				normal[i] = 0;
+			}
+			alpha = 0;
+			used_sides = 0;
+			level = 0;
+		}
+	};
+
+	Vector<Cell> bake_cells;
+	int cell_subdiv; // upper bound applied to p_subdiv in create_capture_octree()
+
+	struct Light { // plain struct with no constructor
+		int x, y, z;
+		float accum[6][3]; //rgb anisotropic
+		float direct_accum[6][3]; //for direct bake
+		int next_leaf;
+	};
+
+	int first_leaf;
+
+	Vector<Light> bake_light; // indexed with bake cell indices in create_capture_octree()
+
+	struct MaterialCache {
+		//128x128 textures
+		Vector<Color> albedo;
+		Vector<Color> emission;
+	};
+
+	Map<Ref<Material>, MaterialCache> material_cache;
+	int leaf_voxel_count;
+	bool direct_lights_baked;
+
+	AABB original_bounds;
+	AABB po2_bounds;
+	int axis_cell_size[3];
+
+	Transform to_cell_space; // returned by get_to_cell_space_xform()
+
+	int color_scan_cell_width; // constructor default: 4
+	int bake_texture_size; // constructor default: 128
+	float cell_size; // returned by get_cell_size()
+	float propagation; // constructor default: 0.85
+	float energy; // constructor default: 1.0
+
+	BakeQuality bake_quality;
+	BakeMode bake_mode;
+
+	int max_original_cells;
+
+	void _init_light_plot(int p_idx, int p_level, int p_x, int p_y, int p_z, uint32_t p_parent);
+
+	Vector<Color> _get_bake_texture(Ref<Image> p_image, const Color &p_color_mul, const Color &p_color_add);
+	MaterialCache _get_material_cache(Ref<Material> p_material);
+	void _plot_face(int p_idx, int p_level, int p_x, int p_y, int p_z, const Vector3 *p_vtx, const Vector2 *p_uv, const MaterialCache &p_material, const AABB &p_aabb);
+	void _fixup_plot(int p_idx, int p_level);
+	void _debug_mesh(int p_idx, int p_level, const AABB &p_aabb, Ref<MultiMesh> &p_multimesh, int &idx, DebugMode p_mode);
+	void _check_init_light();
+
+	uint32_t _find_cell_at_pos(const Cell *cells, int x, int y, int z);
+
+	struct LightMap { // element type of the pixels array taken by _plot_triangle()
+		Vector3 light;
+		Vector3 pos;
+		Vector3 normal;
+	};
+
+	void _plot_triangle(Vector2 *vertices, Vector3 *positions, Vector3 *normals, LightMap *pixels, int width, int height);
+
+	_FORCE_INLINE_ void _sample_baked_octree_filtered_and_anisotropic(const Vector3 &p_posf, const Vector3 &p_direction, float p_level, Vector3 &r_color, float &r_alpha);
+	_FORCE_INLINE_ Vector3 _voxel_cone_trace(const Vector3 &p_pos, const Vector3 &p_normal, float p_aperture);
+	_FORCE_INLINE_ Vector3 _compute_pixel_light_at_pos(const Vector3 &p_pos, const Vector3 &p_normal);
+	_FORCE_INLINE_ Vector3 _compute_ray_trace_at_pos(const Vector3 &p_pos, const Vector3 &p_normal);
+
+public:
+	void begin_bake(int p_subdiv, const AABB &p_bounds);
+	void plot_mesh(const Transform &p_xform, Ref<Mesh> &p_mesh, const Vector<Ref<Material> > &p_materials, const Ref<Material> &p_override_material);
+	void begin_bake_light(BakeQuality p_quality = BAKE_QUALITY_MEDIUM, BakeMode p_bake_mode = BAKE_MODE_CONE_TRACE, float p_propagation = 0.85, float p_energy = 1);
+	void plot_light_directional(const Vector3 &p_direction, const Color &p_color, float p_energy, float p_indirect_energy, bool p_direct);
+	void plot_light_omni(const Vector3 &p_pos, const Color &p_color, float p_energy, float p_indirect_energy, float p_radius, float p_attenutation, bool p_direct); // NOTE(review): "p_attenutation" looks like a typo for "p_attenuation"
+	void plot_light_spot(const Vector3 &p_pos, const Vector3 &p_axis, const Color &p_color, float p_energy, float p_indirect_energy, float p_radius, float p_attenutation, float p_spot_angle, float p_spot_attenuation, bool p_direct);
+	void end_bake();
+	// LightMapData is the type of make_lightmap()'s r_lightmap argument (presumably an output - confirm).
+	struct LightMapData {
+		int width;
+		int height;
+		PoolVector<float> light;
+	};
+	// p_bake_time_func and its p_bake_time_ud pointer are optional; both default to NULL.
+	Error make_lightmap(const Transform &p_xform, Ref<Mesh> &p_mesh, LightMapData &r_lightmap, bool (*p_bake_time_func)(void *, float, float) = NULL, void *p_bake_time_ud = NULL);
+
+	PoolVector<int> create_gi_probe_data();
+	Ref<MultiMesh> create_debug_multimesh(DebugMode p_mode = DEBUG_ALBEDO);
+	PoolVector<uint8_t> create_capture_octree(int p_subdiv); // p_subdiv is clamped to cell_subdiv; returns VoxelLightBakerOctree nodes as raw bytes
+
+	float get_cell_size() const;
+	Transform get_to_cell_space_xform() const;
+	VoxelLightBaker();
+};
+
+#endif // VOXEL_LIGHT_BAKER_H
diff --git a/scene/animation/animation_player.cpp b/scene/animation/animation_player.cpp
index e866e665d8..5e776c5a1a 100644
--- a/scene/animation/animation_player.cpp
+++ b/scene/animation/animation_player.cpp
@@ -233,7 +233,6 @@ void AnimationPlayer::_notification(int p_what) {
 		} break;
 		case NOTIFICATION_EXIT_TREE: {
 
-			//stop_all();
 			clear_caches();
 		} break;
 	}
@@ -738,7 +737,7 @@ void AnimationPlayer::remove_animation(const StringName &p_name) {
 
 	ERR_FAIL_COND(!animation_set.has(p_name));
 
-	stop_all();
+	stop();
 	_unref_anim(animation_set[p_name].animation);
 	animation_set.erase(p_name);
 
@@ -775,9 +774,7 @@ void AnimationPlayer::rename_animation(const StringName &p_name, const StringNam
 	ERR_FAIL_COND(String(p_new_name).find("/") != -1 || String(p_new_name).find(":") != -1);
 	ERR_FAIL_COND(animation_set.has(p_new_name));
 
-	//print_line("Rename anim: "+String(p_name)+" name: "+String(p_new_name));
-
-	stop_all();
+	stop();
 	AnimationData ad = animation_set[p_name];
 	ad.name = p_new_name;
 	animation_set.erase(p_name);
@@ -1019,13 +1016,6 @@ void AnimationPlayer::stop(bool p_reset) {
 	playing = false;
 }
 
-void AnimationPlayer::stop_all() {
-
-	stop();
-
-	_set_process(false); // always process when starting an animation
-}
-
 void AnimationPlayer::set_speed_scale(float p_speed) {
 
 	speed_scale = p_speed;
@@ -1307,8 +1297,8 @@ void AnimationPlayer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("play", "name", "custom_blend", "custom_speed", "from_end"), &AnimationPlayer::play, DEFVAL(""), DEFVAL(-1), DEFVAL(1.0), DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("play_backwards", "name", "custom_blend"), &AnimationPlayer::play_backwards, DEFVAL(""), DEFVAL(-1));
 	ClassDB::bind_method(D_METHOD("stop", "reset"), &AnimationPlayer::stop, DEFVAL(true));
-	ClassDB::bind_method(D_METHOD("stop_all"), &AnimationPlayer::stop_all);
 	ClassDB::bind_method(D_METHOD("is_playing"), &AnimationPlayer::is_playing);
+
 	ClassDB::bind_method(D_METHOD("set_current_animation", "anim"), &AnimationPlayer::set_current_animation);
 	ClassDB::bind_method(D_METHOD("get_current_animation"), &AnimationPlayer::get_current_animation);
 	ClassDB::bind_method(D_METHOD("queue", "name"), &AnimationPlayer::queue);
@@ -1326,9 +1316,6 @@ void AnimationPlayer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_root", "path"), &AnimationPlayer::set_root);
 	ClassDB::bind_method(D_METHOD("get_root"), &AnimationPlayer::get_root);
 
-	ClassDB::bind_method(D_METHOD("seek", "seconds", "update"), &AnimationPlayer::seek, DEFVAL(false));
-	ClassDB::bind_method(D_METHOD("advance", "delta"), &AnimationPlayer::advance);
-
 	ClassDB::bind_method(D_METHOD("find_animation", "animation"), &AnimationPlayer::find_animation);
 
 	ClassDB::bind_method(D_METHOD("clear_caches"), &AnimationPlayer::clear_caches);
@@ -1339,15 +1326,13 @@ void AnimationPlayer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_current_animation_position"), &AnimationPlayer::get_current_animation_position);
 	ClassDB::bind_method(D_METHOD("get_current_animation_length"), &AnimationPlayer::get_current_animation_length);
 
+	ClassDB::bind_method(D_METHOD("seek", "seconds", "update"), &AnimationPlayer::seek, DEFVAL(false));
+	ClassDB::bind_method(D_METHOD("advance", "delta"), &AnimationPlayer::advance);
+
 	ADD_GROUP("Playback Options", "playback_");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "playback_process_mode", PROPERTY_HINT_ENUM, "Physics,Idle"), "set_animation_process_mode", "get_animation_process_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::REAL, "playback_default_blend_time", PROPERTY_HINT_RANGE, "0,4096,0.01"), "set_default_blend_time", "get_default_blend_time");
-
 	ADD_PROPERTY(PropertyInfo(Variant::NODE_PATH, "root_node"), "set_root", "get_root");
-	ADD_PROPERTY(PropertyInfo(Variant::STRING, "autoplay"), "set_autoplay", "get_autoplay");
-	ADD_PROPERTY(PropertyInfo(Variant::REAL, "speed_scale"), "set_speed_scale", "get_speed_scale");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "active"), "set_active", "is_active");
-	ADD_PROPERTY(PropertyInfo(Variant::STRING, "current_animation"), "set_current_animation", "get_current_animation");
 
 	ADD_SIGNAL(MethodInfo("animation_finished", PropertyInfo(Variant::STRING, "name")));
 	ADD_SIGNAL(MethodInfo("animation_changed", PropertyInfo(Variant::STRING, "old_name"), PropertyInfo(Variant::STRING, "new_name")));
diff --git a/scene/gui/dialogs.cpp b/scene/gui/dialogs.cpp
index d4912339da..9a55073bb6 100644
--- a/scene/gui/dialogs.cpp
+++ b/scene/gui/dialogs.cpp
@@ -289,10 +289,17 @@ bool WindowDialog::get_resizable() const {
 Size2 WindowDialog::get_minimum_size() const {
 
 	Ref<Font> font = get_font("title_font", "WindowDialog");
-	int msx = close_button->get_combined_minimum_size().x;
-	msx += font->get_string_size(title).x;
 
-	return Size2(msx, 1);
+	const int button_width = close_button->get_combined_minimum_size().x;
+	const int title_width = font->get_string_size(title).x;
+	const int padding = button_width / 2;
+	const int button_area = button_width + padding;
+
+	// The title is centered, so title_width + button_width is not wide enough.
+	// We need a width w such that w / 2 - title_width / 2 >= button_area, so the
+	// centered title stays clear of the close button and its padding: w >= 2 * button_area + title_width.
+
+	return Size2(2 * button_area + title_width, 1);
 }
 
 TextureButton *WindowDialog::get_close_button() {
diff --git a/scene/gui/popup_menu.cpp b/scene/gui/popup_menu.cpp
index e37cdd5cc9..698676cc39 100644
--- a/scene/gui/popup_menu.cpp
+++ b/scene/gui/popup_menu.cpp
@@ -42,24 +42,6 @@ String PopupMenu::_get_accel_text(int p_item) const {
 	else if (items[p_item].accel)
 		return keycode_get_string(items[p_item].accel);
 	return String();
-
-	/*
-	String atxt;
-	if (p_accel&KEY_MASK_SHIFT)
-		atxt+="Shift+";
-	if (p_accel&KEY_MASK_ALT)
-		atxt+="Alt+";
-	if (p_accel&KEY_MASK_CTRL)
-		atxt+="Ctrl+";
-	if (p_accel&KEY_MASK_META)
-		atxt+="Meta+";
-
-	p_accel&=KEY_CODE_MASK;
-
-	atxt+=String::chr(p_accel).to_upper();
-
-	return atxt;
-*/
 }
 
 Size2 PopupMenu::get_minimum_size() const {
@@ -136,7 +118,6 @@ int PopupMenu::_get_mouse_over(const Point2 &p_over) const {
 
 	Ref<Font> font = get_font("font");
 	int vseparation = get_constant("vseparation");
-	//int hseparation = get_constant("hseparation");
 	float font_h = font->get_height();
 
 	for (int i = 0; i < items.size(); i++) {
@@ -230,6 +211,11 @@ void PopupMenu::_gui_input(const Ref<InputEvent> &p_event) {
 
 						mouse_over = i;
 						update();
+
+						if (items[i].submenu != "" && submenu_over != i) {
+							submenu_over = i;
+							submenu_timer->start();
+						}
 						break;
 					}
 				}
@@ -245,6 +231,11 @@ void PopupMenu::_gui_input(const Ref<InputEvent> &p_event) {
 
 						mouse_over = i;
 						update();
+
+						if (items[i].submenu != "" && submenu_over != i) {
+							submenu_over = i;
+							submenu_timer->start();
+						}
 						break;
 					}
 				}
@@ -500,6 +491,13 @@ void PopupMenu::_notification(int p_what) {
 		} break;
 		case NOTIFICATION_MOUSE_EXIT: {
 
+			if (mouse_over >= 0 && (items[mouse_over].submenu == "" || submenu_over != -1)) {
+				mouse_over = -1;
+				update();
+			}
+		} break;
+		case NOTIFICATION_POPUP_HIDE: {
+
 			if (mouse_over >= 0) {
 				mouse_over = -1;
 				update();
@@ -624,7 +622,7 @@ void PopupMenu::add_check_shortcut(const Ref<ShortCut> &p_shortcut, int p_ID, bo
 	update();
 }
 
-void PopupMenu::add_statable_item(const String &p_label, int p_max_states, int p_default_state, int p_ID, uint32_t p_accel) {
+void PopupMenu::add_multistate_item(const String &p_label, int p_max_states, int p_default_state, int p_ID, uint32_t p_accel) {
 
 	Item item;
 	item.text = p_label;
@@ -839,14 +837,14 @@ void PopupMenu::set_item_h_offset(int p_idx, int p_offset) {
 	update();
 }
 
-void PopupMenu::set_item_statable(int p_idx, int p_state) {
+void PopupMenu::set_item_multistate(int p_idx, int p_state) {
 
 	ERR_FAIL_INDEX(p_idx, items.size());
 	items[p_idx].state = p_state;
 	update();
 }
 
-void PopupMenu::toggle_item_statable(int p_idx) {
+void PopupMenu::toggle_item_multistate(int p_idx) {
 
 	ERR_FAIL_INDEX(p_idx, items.size());
 	if (0 >= items[p_idx].max_states) {
@@ -940,7 +938,7 @@ void PopupMenu::activate_item(int p_item) {
 			if (!hide_on_checkable_item_selection || !pop->is_hide_on_checkable_item_selection())
 				break;
 		} else if (0 < items[p_item].max_states) {
-			if (!hide_on_statable_item_selection || !pop->is_hide_on_statable_item_selection())
+			if (!hide_on_multistate_item_selection || !pop->is_hide_on_multistate_item_selection())
 				break;
 		} else if (!hide_on_item_selection || !pop->is_hide_on_item_selection())
 			break;
@@ -957,7 +955,7 @@ void PopupMenu::activate_item(int p_item) {
 		if (!hide_on_checkable_item_selection)
 			return;
 	} else if (0 < items[p_item].max_states) {
-		if (!hide_on_statable_item_selection)
+		if (!hide_on_multistate_item_selection)
 			return;
 	} else if (!hide_on_item_selection)
 		return;
@@ -1093,14 +1091,14 @@ bool PopupMenu::is_hide_on_checkable_item_selection() const {
 	return hide_on_checkable_item_selection;
 }
 
-void PopupMenu::set_hide_on_statable_item_selection(bool p_enabled) {
+void PopupMenu::set_hide_on_multistate_item_selection(bool p_enabled) {
 
-	hide_on_statable_item_selection = p_enabled;
+	hide_on_multistate_item_selection = p_enabled;
 }
 
-bool PopupMenu::is_hide_on_statable_item_selection() const {
+bool PopupMenu::is_hide_on_multistate_item_selection() const {
 
-	return hide_on_statable_item_selection;
+	return hide_on_multistate_item_selection;
 }
 
 String PopupMenu::get_tooltip(const Point2 &p_pos) const {
@@ -1161,10 +1159,10 @@ void PopupMenu::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_item_as_checkable", "idx", "enable"), &PopupMenu::set_item_as_checkable);
 	ClassDB::bind_method(D_METHOD("set_item_tooltip", "idx", "tooltip"), &PopupMenu::set_item_tooltip);
 	ClassDB::bind_method(D_METHOD("set_item_shortcut", "idx", "shortcut", "global"), &PopupMenu::set_item_shortcut, DEFVAL(false));
-	ClassDB::bind_method(D_METHOD("set_item_statable", "idx", "state"), &PopupMenu::set_item_statable);
+	ClassDB::bind_method(D_METHOD("set_item_multistate", "idx", "state"), &PopupMenu::set_item_multistate);
 
 	ClassDB::bind_method(D_METHOD("toggle_item_checked", "idx"), &PopupMenu::toggle_item_checked);
-	ClassDB::bind_method(D_METHOD("toggle_item_statable", "idx"), &PopupMenu::toggle_item_statable);
+	ClassDB::bind_method(D_METHOD("toggle_item_multistate", "idx"), &PopupMenu::toggle_item_multistate);
 
 	ClassDB::bind_method(D_METHOD("get_item_text", "idx"), &PopupMenu::get_item_text);
 	ClassDB::bind_method(D_METHOD("get_item_icon", "idx"), &PopupMenu::get_item_icon);
@@ -1196,8 +1194,8 @@ void PopupMenu::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_hide_on_checkable_item_selection", "enable"), &PopupMenu::set_hide_on_checkable_item_selection);
 	ClassDB::bind_method(D_METHOD("is_hide_on_checkable_item_selection"), &PopupMenu::is_hide_on_checkable_item_selection);
 
-	ClassDB::bind_method(D_METHOD("set_hide_on_state_item_selection", "enable"), &PopupMenu::set_hide_on_statable_item_selection);
-	ClassDB::bind_method(D_METHOD("is_hide_on_state_item_selection"), &PopupMenu::is_hide_on_statable_item_selection);
+	ClassDB::bind_method(D_METHOD("set_hide_on_state_item_selection", "enable"), &PopupMenu::set_hide_on_multistate_item_selection);
+	ClassDB::bind_method(D_METHOD("is_hide_on_state_item_selection"), &PopupMenu::is_hide_on_multistate_item_selection);
 
 	ClassDB::bind_method(D_METHOD("_submenu_timeout"), &PopupMenu::_submenu_timeout);
 
@@ -1217,12 +1215,13 @@ void PopupMenu::set_invalidate_click_until_motion() {
 PopupMenu::PopupMenu() {
 
 	mouse_over = -1;
+	submenu_over = -1;
 
 	set_focus_mode(FOCUS_ALL);
 	set_as_toplevel(true);
 	set_hide_on_item_selection(true);
 	set_hide_on_checkable_item_selection(true);
-	set_hide_on_statable_item_selection(false);
+	set_hide_on_multistate_item_selection(false);
 
 	submenu_timer = memnew(Timer);
 	submenu_timer->set_wait_time(0.3);
diff --git a/scene/gui/popup_menu.h b/scene/gui/popup_menu.h
index 5a10bf0765..ee514f4c4b 100644
--- a/scene/gui/popup_menu.h
+++ b/scene/gui/popup_menu.h
@@ -90,7 +90,7 @@ class PopupMenu : public Popup {
 	bool invalidated_click;
 	bool hide_on_item_selection;
 	bool hide_on_checkable_item_selection;
-	bool hide_on_statable_item_selection;
+	bool hide_on_multistate_item_selection;
 	Vector2 moved;
 
 	Array _get_items() const;
@@ -120,7 +120,7 @@ public:
 	void add_icon_check_shortcut(const Ref<Texture> &p_icon, const Ref<ShortCut> &p_shortcut, int p_ID = -1, bool p_global = false);
 	void add_check_shortcut(const Ref<ShortCut> &p_shortcut, int p_ID = -1, bool p_global = false);
 
-	void add_statable_item(const String &p_label, int p_max_states, int p_default_state, int p_ID = -1, uint32_t p_accel = 0);
+	void add_multistate_item(const String &p_label, int p_max_states, int p_default_state, int p_ID = -1, uint32_t p_accel = 0);
 
 	void set_item_text(int p_idx, const String &p_text);
 	void set_item_icon(int p_idx, const Ref<Texture> &p_icon);
@@ -135,8 +135,8 @@ public:
 	void set_item_tooltip(int p_idx, const String &p_tooltip);
 	void set_item_shortcut(int p_idx, const Ref<ShortCut> &p_shortcut, bool p_global = false);
 	void set_item_h_offset(int p_idx, int p_offset);
-	void set_item_statable(int p_idx, int p_state);
-	void toggle_item_statable(int p_idx);
+	void set_item_multistate(int p_idx, int p_state);
+	void toggle_item_multistate(int p_idx);
 
 	void toggle_item_checked(int p_idx);
 
@@ -183,8 +183,8 @@ public:
 	void set_hide_on_checkable_item_selection(bool p_enabled);
 	bool is_hide_on_checkable_item_selection() const;
 
-	void set_hide_on_statable_item_selection(bool p_enabled);
-	bool is_hide_on_statable_item_selection() const;
+	void set_hide_on_multistate_item_selection(bool p_enabled);
+	bool is_hide_on_multistate_item_selection() const;
 
 	PopupMenu();
 	~PopupMenu();
diff --git a/scene/gui/rich_text_label.cpp b/scene/gui/rich_text_label.cpp
index 45188c3a52..6fbc58a38a 100644
--- a/scene/gui/rich_text_label.cpp
+++ b/scene/gui/rich_text_label.cpp
@@ -31,6 +31,11 @@
 #include "os/keyboard.h"
 #include "os/os.h"
 #include "scene/scene_string_names.h"
+
+#ifdef TOOLS_ENABLED
+#include "editor/editor_node.h"
+#endif
+
 RichTextLabel::Item *RichTextLabel::_get_next_item(Item *p_item, bool p_free) {
 
 	if (p_free) {
@@ -370,7 +375,11 @@ int RichTextLabel::_process_line(ItemFrame *p_frame, const Vector2 &p_ofs, int &
 									Color uc = color;
 									uc.a *= 0.5;
 									int uy = y + lh - fh + ascent + 2;
-									VS::get_singleton()->canvas_item_add_line(ci, p_ofs + Point2(align_ofs + pofs, uy), p_ofs + Point2(align_ofs + pofs + cw, uy), uc);
+									float underline_width = 1.0;
+#ifdef TOOLS_ENABLED
+									underline_width *= EDSCALE;
+#endif
+									VS::get_singleton()->canvas_item_add_line(ci, p_ofs + Point2(align_ofs + pofs, uy), p_ofs + Point2(align_ofs + pofs + cw, uy), uc, underline_width);
 								}
 								ofs += cw;
 							}
@@ -453,7 +462,7 @@ int RichTextLabel::_process_line(ItemFrame *p_frame, const Vector2 &p_ofs, int &
 						for (int i = 0; i < frame->lines.size(); i++) {
 
 							_process_line(frame, Point2(), ly, p_width, i, PROCESS_CACHE, cfont, Color());
-							table->columns[column].min_width = MAX(table->columns[i].min_width, frame->lines[i].minimum_width);
+							table->columns[column].min_width = MAX(table->columns[column].min_width, frame->lines[i].minimum_width);
 						}
 						idx++;
 					}
diff --git a/scene/gui/slider.cpp b/scene/gui/slider.cpp
index e88742a3e3..70b8616af1 100644
--- a/scene/gui/slider.cpp
+++ b/scene/gui/slider.cpp
@@ -171,47 +171,48 @@ void Slider::_notification(int p_what) {
 			Ref<StyleBox> grabber_area = get_stylebox("grabber_area");
 			Ref<Texture> grabber = get_icon(editable ? ((mouse_inside || has_focus()) ? "grabber_highlight" : "grabber") : "grabber_disabled");
 			Ref<Texture> tick = get_icon("tick");
+			double ratio = Math::is_nan(get_as_ratio()) ? 0 : get_as_ratio();
 
 			if (orientation == VERTICAL) {
 
 				int widget_width = style->get_minimum_size().width + style->get_center_size().width;
 				float areasize = size.height - grabber->get_size().height;
 				style->draw(ci, Rect2i(Point2i(size.width / 2 - widget_width / 2, 0), Size2i(widget_width, size.height)));
-				grabber_area->draw(ci, Rect2i(Point2i((size.width - widget_width) / 2, size.height - areasize * get_as_ratio() - grabber->get_size().height / 2), Size2i(widget_width, areasize * get_as_ratio() + grabber->get_size().width / 2)));
+				grabber_area->draw(ci, Rect2i(Point2i((size.width - widget_width) / 2, size.height - areasize * ratio - grabber->get_size().height / 2), Size2i(widget_width, areasize * ratio + grabber->get_size().width / 2)));
 				/*
 				if (mouse_inside||has_focus())
 					focus->draw(ci,Rect2i(Point2i(),Size2i(style->get_minimum_size().width+style->get_center_size().width,size.height)));
 				*/
 				if (ticks > 1) {
-					int tickarea = size.height - tick->get_height();
+					int grabber_offset = (grabber->get_size().height / 2 - tick->get_height() / 2);
 					for (int i = 0; i < ticks; i++) {
 						if (!ticks_on_borders && (i == 0 || i + 1 == ticks)) continue;
-						int ofs = i * tickarea / (ticks - 1);
+						int ofs = (i * areasize / (ticks - 1)) + grabber_offset;
 						tick->draw(ci, Point2i((size.width - widget_width) / 2, ofs));
 					}
 				}
-				grabber->draw(ci, Point2i(size.width / 2 - grabber->get_size().width / 2, size.height - get_as_ratio() * areasize - grabber->get_size().height));
+				grabber->draw(ci, Point2i(size.width / 2 - grabber->get_size().width / 2, size.height - ratio * areasize - grabber->get_size().height));
 			} else {
 
 				int widget_height = style->get_minimum_size().height + style->get_center_size().height;
 				float areasize = size.width - grabber->get_size().width;
 
 				style->draw(ci, Rect2i(Point2i(0, (size.height - widget_height) / 2), Size2i(size.width, widget_height)));
-				grabber_area->draw(ci, Rect2i(Point2i(0, (size.height - widget_height) / 2), Size2i(areasize * get_as_ratio() + grabber->get_size().width / 2, widget_height)));
+				grabber_area->draw(ci, Rect2i(Point2i(0, (size.height - widget_height) / 2), Size2i(areasize * ratio + grabber->get_size().width / 2, widget_height)));
 				/*
 				if (mouse_inside||has_focus())
 					focus->draw(ci,Rect2i(Point2i(),Size2i(size.width,style->get_minimum_size().height+style->get_center_size().height)));
 				*/
 
 				if (ticks > 1) {
-					int tickarea = size.width - tick->get_width();
+					int grabber_offset = (grabber->get_size().width / 2 - tick->get_width() / 2);
 					for (int i = 0; i < ticks; i++) {
 						if ((!ticks_on_borders) && ((i == 0) || ((i + 1) == ticks))) continue;
-						int ofs = i * tickarea / (ticks - 1);
+						int ofs = (i * areasize / (ticks - 1)) + grabber_offset;
 						tick->draw(ci, Point2i(ofs, (size.height - widget_height) / 2));
 					}
 				}
-				grabber->draw(ci, Point2i(get_as_ratio() * areasize, size.height / 2 - grabber->get_size().height / 2));
+				grabber->draw(ci, Point2i(ratio * areasize, size.height / 2 - grabber->get_size().height / 2));
 			}
 
 		} break;
diff --git a/scene/gui/text_edit.cpp b/scene/gui/text_edit.cpp
index e5169089f2..2ce709732c 100644
--- a/scene/gui/text_edit.cpp
+++ b/scene/gui/text_edit.cpp
@@ -1664,17 +1664,22 @@ void TextEdit::backspace_at_cursor() {
 	cursor_set_column(prev_column);
 }
 
-void TextEdit::indent_selection_right() {
+void TextEdit::indent_right() {
 
-	if (!is_selection_active()) {
-		return;
-	}
+	int start_line;
+	int end_line;
 	begin_complex_operation();
-	int start_line = get_selection_from_line();
-	int end_line = get_selection_to_line();
+
+	if (is_selection_active()) {
+		start_line = get_selection_from_line();
+		end_line = get_selection_to_line();
+	} else {
+		start_line = cursor.line;
+		end_line = start_line;
+	}
 
 	// ignore if the cursor is not past the first column
-	if (get_selection_to_column() == 0) {
+	if (is_selection_active() && get_selection_to_column() == 0) {
 		end_line--;
 	}
 
@@ -1688,23 +1693,32 @@ void TextEdit::indent_selection_right() {
 		set_line(i, line_text);
 	}
 
-	// fix selection being off by one on the last line
-	selection.to_column++;
+	// fix selection and cursor being off by one on the last line
+	if (is_selection_active()) {
+		selection.to_column++;
+		selection.from_column++;
+	}
+	cursor.column++;
 	end_complex_operation();
 	update();
 }
 
-void TextEdit::indent_selection_left() {
+void TextEdit::indent_left() {
 
-	if (!is_selection_active()) {
-		return;
-	}
+	int start_line;
+	int end_line;
 	begin_complex_operation();
-	int start_line = get_selection_from_line();
-	int end_line = get_selection_to_line();
+
+	if (is_selection_active()) {
+		start_line = get_selection_from_line();
+		end_line = get_selection_to_line();
+	} else {
+		start_line = cursor.line;
+		end_line = start_line;
+	}
 
 	// ignore if the cursor is not past the first column
-	if (get_selection_to_column() == 0) {
+	if (is_selection_active() && get_selection_to_column() == 0) {
 		end_line--;
 	}
 	String last_line_text = get_line(end_line);
@@ -1721,9 +1735,15 @@ void TextEdit::indent_selection_left() {
 		}
 	}
 
-	// fix selection being off by one on the last line
-	if (last_line_text != get_line(end_line) && selection.to_column > 0) {
-		selection.to_column--;
+	// fix selection and cursor being off by one on the last line
+	if (is_selection_active() && last_line_text != get_line(end_line)) {
+		if (selection.to_column > 0)
+			selection.to_column--;
+		if (selection.from_column > 0)
+			selection.from_column--;
+	}
+	if (cursor.column > 0) {
+		cursor.column--;
 	}
 	end_complex_operation();
 	update();
@@ -2216,9 +2236,9 @@ void TextEdit::_gui_input(const Ref<InputEvent> &p_gui_input) {
 
 				case KEY_TAB: {
 					if (k->get_shift()) {
-						indent_selection_left();
+						indent_left();
 					} else {
-						indent_selection_right();
+						indent_right();
 					}
 					dobreak = true;
 					accept_event();
@@ -2389,8 +2409,12 @@ void TextEdit::_gui_input(const Ref<InputEvent> &p_gui_input) {
 				if (readonly)
 					break;
 
-				if (selection.active) {
-
+				if (is_selection_active()) {
+					if (k->get_shift()) {
+						indent_left();
+					} else {
+						indent_right();
+					}
 				} else {
 					if (k->get_shift()) {
 
@@ -5657,4 +5681,4 @@ TextEdit::TextEdit() {
 }
 
 TextEdit::~TextEdit() {
-}
+}
+\ No newline at end of file
diff --git a/scene/gui/text_edit.h b/scene/gui/text_edit.h
index dd305d5822..edef28cc25 100644
--- a/scene/gui/text_edit.h
+++ b/scene/gui/text_edit.h
@@ -443,8 +443,8 @@ public:
 	void set_line(int line, String new_text);
 	void backspace_at_cursor();
 
-	void indent_selection_left();
-	void indent_selection_right();
+	void indent_left();
+	void indent_right();
 	int get_indent_level(int p_line) const;
 
 	inline void set_scroll_pass_end_of_file(bool p_enabled) {
diff --git a/scene/gui/tree.cpp b/scene/gui/tree.cpp
index ab12d123ba..b5b42e8f29 100644
--- a/scene/gui/tree.cpp
+++ b/scene/gui/tree.cpp
@@ -30,6 +30,7 @@
 #include "tree.h"
 #include <limits.h>
 
+#include "math_funcs.h"
 #include "os/input.h"
 #include "os/keyboard.h"
 #include "os/os.h"
@@ -37,6 +38,10 @@
 #include "project_settings.h"
 #include "scene/main/viewport.h"
 
+#ifdef TOOLS_ENABLED
+#include "editor/editor_node.h"
+#endif
+
 void TreeItem::move_to_top() {
 
 	if (!parent || parent->childs == this)
@@ -1412,9 +1417,14 @@ int Tree::draw_item(const Point2i &p_pos, const Point2 &p_draw_ofs, const Size2
 				if (c->get_children() != NULL)
 					root_pos -= Point2i(cache.arrow->get_width(), 0);
 
+				float line_width = 1.0;
+#ifdef TOOLS_ENABLED
+				line_width *= EDSCALE;
+#endif
+
 				Point2i parent_pos = Point2i(parent_ofs - cache.arrow->get_width() / 2, p_pos.y + label_h / 2 + cache.arrow->get_height() / 2) - cache.offset + p_draw_ofs;
-				VisualServer::get_singleton()->canvas_item_add_line(ci, root_pos, Point2i(parent_pos.x, root_pos.y), cache.relationship_line_color);
-				VisualServer::get_singleton()->canvas_item_add_line(ci, Point2i(parent_pos.x, root_pos.y), parent_pos, cache.relationship_line_color);
+				VisualServer::get_singleton()->canvas_item_add_line(ci, root_pos, Point2i(parent_pos.x - Math::floor(line_width / 2), root_pos.y), cache.relationship_line_color, line_width);
+				VisualServer::get_singleton()->canvas_item_add_line(ci, Point2i(parent_pos.x, root_pos.y), parent_pos, cache.relationship_line_color, line_width);
 			}
 
 			int child_h = draw_item(children_pos, p_draw_ofs, p_draw_size, c);
diff --git a/scene/main/http_request.cpp b/scene/main/http_request.cpp
index 672e893f1b..4afdb56f86 100644
--- a/scene/main/http_request.cpp
+++ b/scene/main/http_request.cpp
@@ -36,7 +36,6 @@ void HTTPRequest::_redirect_request(const String &p_new_url) {
 
 Error HTTPRequest::_request() {
 
-	//print_line("Requesting:\n\tURL: "+url+"\n\tString: "+request_string+"\n\tPort: "+itos(port)+"\n\tSSL: "+itos(use_ssl)+"\n\tValidate SSL: "+itos(validate_ssl));
 	return client->connect_to_host(url, port, use_ssl, validate_ssl);
 }
 
@@ -54,37 +53,32 @@ Error HTTPRequest::_parse_url(const String &p_url) {
 	downloaded = 0;
 	redirections = 0;
 
-	//print_line("1 url: "+url);
-	if (url.begins_with("http://")) {
-
+	String url_lower = url.to_lower();
+	if (url_lower.begins_with("http://")) {
 		url = url.substr(7, url.length() - 7);
-		//print_line("no SSL");
-
-	} else if (url.begins_with("https://")) {
+	} else if (url_lower.begins_with("https://")) {
 		url = url.substr(8, url.length() - 8);
 		use_ssl = true;
 		port = 443;
-		//print_line("yes SSL");
 	} else {
 		ERR_EXPLAIN("Malformed URL");
 		ERR_FAIL_V(ERR_INVALID_PARAMETER);
 	}
 
-	//print_line("2 url: "+url);
+	if (url.length() < 1) {
+		ERR_EXPLAIN("URL too short");
+		ERR_FAIL_V(ERR_INVALID_PARAMETER);
+	}
 
 	int slash_pos = url.find("/");
 
 	if (slash_pos != -1) {
 		request_string = url.substr(slash_pos, url.length());
 		url = url.substr(0, slash_pos);
-		//print_line("request string: "+request_string);
 	} else {
 		request_string = "/";
-		//print_line("no request");
 	}
 
-	//print_line("3 url: "+url);
-
 	int colon_pos = url.find(":");
 	if (colon_pos != -1) {
 		port = url.substr(colon_pos + 1, url.length()).to_int();
@@ -92,8 +86,6 @@ Error HTTPRequest::_parse_url(const String &p_url) {
 		ERR_FAIL_COND_V(port < 1 || port > 65535, ERR_INVALID_PARAMETER);
 	}
 
-	//print_line("4 url: "+url);
-
 	return OK;
 }
 
@@ -198,10 +190,8 @@ void HTTPRequest::cancel_request() {
 	}
 	client->close();
 	body.resize(0);
-	//downloaded=0;
 	got_response = false;
 	response_code = -1;
-	//body_len=-1;
 	request_sent = false;
 	requesting = false;
 }
@@ -221,12 +211,12 @@ bool HTTPRequest::_handle_response(bool *ret_value) {
 	response_headers.resize(0);
 	downloaded = 0;
 	for (List<String>::Element *E = rheaders.front(); E; E = E->next()) {
-		//print_line("HEADER: "+E->get());
 		response_headers.push_back(E->get());
 	}
 
 	if (response_code == 301 || response_code == 302) {
-		//redirect
+		// Handle redirect
+
 		if (max_redirects >= 0 && redirections >= max_redirects) {
 
 			call_deferred("_request_done", RESULT_REDIRECT_LIMIT_REACHED, response_code, response_headers, PoolByteArray());
@@ -242,15 +232,13 @@ bool HTTPRequest::_handle_response(bool *ret_value) {
 			}
 		}
 
-		//print_line("NEW LOCATION: "+new_request);
-
 		if (new_request != "") {
-			//process redirect
+			// Process redirect
 			client->close();
-			int new_redirs = redirections + 1; //because _request() will clear it
+			int new_redirs = redirections + 1; // Because _request() will clear it
 			Error err;
 			if (new_request.begins_with("http")) {
-				//new url, request all again
+				// New url, request all again
 				err = _parse_url(new_request);
 			} else {
 				request_string = new_request;
@@ -258,7 +246,6 @@ bool HTTPRequest::_handle_response(bool *ret_value) {
 
 			err = _request();
 
-			//print_line("new connection: "+itos(err));
 			if (err == OK) {
 				request_sent = false;
 				got_response = false;
@@ -280,11 +267,11 @@ bool HTTPRequest::_update_connection() {
 	switch (client->get_status()) {
 		case HTTPClient::STATUS_DISCONNECTED: {
 			call_deferred("_request_done", RESULT_CANT_CONNECT, 0, PoolStringArray(), PoolByteArray());
-			return true; //end it, since it's doing something
+			return true; // End it, since it's doing something
 		} break;
 		case HTTPClient::STATUS_RESOLVING: {
 			client->poll();
-			//must wait
+			// Must wait
 			return false;
 		} break;
 		case HTTPClient::STATUS_CANT_RESOLVE: {
@@ -294,9 +281,9 @@ bool HTTPRequest::_update_connection() {
 		} break;
 		case HTTPClient::STATUS_CONNECTING: {
 			client->poll();
-			//must wait
+			// Must wait
 			return false;
-		} break; //connecting to ip
+		} break; // Connecting to IP
 		case HTTPClient::STATUS_CANT_CONNECT: {
 
 			call_deferred("_request_done", RESULT_CANT_CONNECT, 0, PoolStringArray(), PoolByteArray());
@@ -309,7 +296,7 @@ bool HTTPRequest::_update_connection() {
 
 				if (!got_response) {
 
-					//no body
+					// No body
 
 					bool ret_value;
 
@@ -320,16 +307,16 @@ bool HTTPRequest::_update_connection() {
 					return true;
 				}
 				if (got_response && body_len < 0) {
-					//chunked transfer is done
+					// Chunked transfer is done
 					call_deferred("_request_done", RESULT_SUCCESS, response_code, response_headers, body);
 					return true;
 				}
 
 				call_deferred("_request_done", RESULT_CHUNKED_BODY_SIZE_MISMATCH, response_code, response_headers, PoolByteArray());
 				return true;
-				//request migh have been done
+				// Request might have been done
 			} else {
-				//did not request yet, do request
+				// Did not request yet, do request
 
 				Error err = client->request(method, request_string, headers, request_data);
 				if (err != OK) {
@@ -340,13 +327,13 @@ bool HTTPRequest::_update_connection() {
 				request_sent = true;
 				return false;
 			}
-		} break; //connected: { } break requests only accepted here
+		} break; // Connected: requests only accepted here
 		case HTTPClient::STATUS_REQUESTING: {
-			//must wait, it's requesting
+			// Must wait, still requesting
 			client->poll();
 			return false;
 
-		} break; // request in progress
+		} break; // Request in progress
 		case HTTPClient::STATUS_BODY: {
 
 			if (!got_response) {
@@ -363,7 +350,7 @@ bool HTTPRequest::_update_connection() {
 				}
 
 				if (client->is_response_chunked()) {
-					body_len = -1; //no body len because chunked, change your webserver configuration if you want body len
+					body_len = -1; // No body len because chunked, change your webserver configuration if you want body len
 				} else {
 					body_len = client->get_response_body_length();
 
@@ -383,7 +370,6 @@ bool HTTPRequest::_update_connection() {
 				}
 			}
 
-			//print_line("BODY: "+itos(body.size()));
 			client->poll();
 
 			PoolByteArray chunk = client->read_response_body_chunk();
@@ -411,15 +397,11 @@ bool HTTPRequest::_update_connection() {
 					call_deferred("_request_done", RESULT_SUCCESS, response_code, response_headers, body);
 					return true;
 				}
-				/*if (body.size()>=body_len) {
-					call_deferred("_request_done",RESULT_BODY_SIZE_MISMATCH,response_code,response_headers,ByteArray());
-					return true;
-				}*/
 			}
 
 			return false;
 
-		} break; // request resulted in body: { } break which must be read
+		} break; // Request resulted in body, which must be read
 		case HTTPClient::STATUS_CONNECTION_ERROR: {
 			call_deferred("_request_done", RESULT_CONNECTION_ERROR, 0, PoolStringArray(), PoolByteArray());
 			return true;
@@ -449,7 +431,7 @@ void HTTPRequest::_notification(int p_what) {
 		if (done) {
 
 			set_process_internal(false);
-			//cancel_request(); called from _request done now
+			// cancel_request(); called from _request done now
 		}
 	}
 
@@ -543,7 +525,7 @@ void HTTPRequest::_bind_methods() {
 
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_threads"), "set_use_threads", "is_using_threads");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "body_size_limit", PROPERTY_HINT_RANGE, "-1,2000000000"), "set_body_size_limit", "get_body_size_limit");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "max_redirects", PROPERTY_HINT_RANGE, "-1,1024"), "set_max_redirects", "get_max_redirects");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "max_redirects", PROPERTY_HINT_RANGE, "-1,64"), "set_max_redirects", "get_max_redirects");
 
 	ADD_SIGNAL(MethodInfo("request_completed", PropertyInfo(Variant::INT, "result"), PropertyInfo(Variant::INT, "response_code"), PropertyInfo(Variant::POOL_STRING_ARRAY, "headers"), PropertyInfo(Variant::POOL_BYTE_ARRAY, "body")));
 
diff --git a/scene/main/http_request.h b/scene/main/http_request.h
index 790ff5f7ef..ab5a79c40d 100644
--- a/scene/main/http_request.h
+++ b/scene/main/http_request.h
@@ -42,7 +42,6 @@ class HTTPRequest : public Node {
 public:
 	enum Result {
 		RESULT_SUCCESS,
-		//RESULT_NO_BODY,
 		RESULT_CHUNKED_BODY_SIZE_MISMATCH,
 		RESULT_CANT_CONNECT,
 		RESULT_CANT_RESOLVE,
diff --git a/scene/main/node.cpp b/scene/main/node.cpp
index cae368aeca..efc5d269a6 100644
--- a/scene/main/node.cpp
+++ b/scene/main/node.cpp
@@ -177,8 +177,8 @@ void Node::_propagate_ready() {
 	}
 	data.blocked--;
 	if (data.ready_first) {
-		notification(NOTIFICATION_READY);
 		data.ready_first = false;
+		notification(NOTIFICATION_READY);
 	}
 }
 
diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp
index 4635de81e8..f5d7043a40 100644
--- a/scene/main/viewport.cpp
+++ b/scene/main/viewport.cpp
@@ -1421,7 +1421,7 @@ void Viewport::_gui_show_tooltip() {
 	gui.tooltip_label->set_anchor_and_margin(MARGIN_TOP, Control::ANCHOR_BEGIN, ttp->get_margin(MARGIN_TOP));
 	gui.tooltip_label->set_anchor_and_margin(MARGIN_RIGHT, Control::ANCHOR_END, -ttp->get_margin(MARGIN_RIGHT));
 	gui.tooltip_label->set_anchor_and_margin(MARGIN_BOTTOM, Control::ANCHOR_END, -ttp->get_margin(MARGIN_BOTTOM));
-	gui.tooltip_label->set_text(tooltip);
+	gui.tooltip_label->set_text(tooltip.strip_edges());
 	Rect2 r(gui.tooltip_pos + Point2(10, 10), gui.tooltip_label->get_combined_minimum_size() + ttp->get_minimum_size());
 	Rect2 vr = gui.tooltip_label->get_viewport_rect();
 	if (r.size.x + r.position.x > vr.size.x)
diff --git a/scene/register_scene_types.cpp b/scene/register_scene_types.cpp
index d6557f508e..9715e1d6a0 100644
--- a/scene/register_scene_types.cpp
+++ b/scene/register_scene_types.cpp
@@ -160,6 +160,7 @@
 #include "scene/3d/area.h"
 #include "scene/3d/arvr_nodes.h"
 #include "scene/3d/audio_stream_player_3d.h"
+#include "scene/3d/baked_lightmap.h"
 #include "scene/3d/bone_attachment.h"
 #include "scene/3d/camera.h"
 #include "scene/3d/collision_polygon.h"
@@ -375,6 +376,8 @@ void register_scene_types() {
 	ClassDB::register_class<ReflectionProbe>();
 	ClassDB::register_class<GIProbe>();
 	ClassDB::register_class<GIProbeData>();
+	ClassDB::register_class<BakedLightmap>();
+	ClassDB::register_class<BakedLightmapData>();
 	ClassDB::register_class<AnimationTreePlayer>();
 	ClassDB::register_class<Particles>();
 	ClassDB::register_class<Position3D>();
diff --git a/scene/resources/material.cpp b/scene/resources/material.cpp
index c8ab7c2a04..326320c60f 100644
--- a/scene/resources/material.cpp
+++ b/scene/resources/material.cpp
@@ -645,7 +645,7 @@ void SpatialMaterial::_update_shader() {
 		code += "\tvec2 base_uv = UV;\n";
 	}
 
-	if ((features[FEATURE_DETAIL] && detail_uv == DETAIL_UV_2) || (features[FEATURE_AMBIENT_OCCLUSION] && flags[FLAG_AO_ON_UV2])) {
+	if ((features[FEATURE_DETAIL] && detail_uv == DETAIL_UV_2) || (features[FEATURE_AMBIENT_OCCLUSION] && flags[FLAG_AO_ON_UV2]) || (features[FEATURE_EMISSION] && flags[FLAG_EMISSION_ON_UV2])) {
 		code += "\tvec2 base_uv2 = UV2;\n";
 	}
 
@@ -729,11 +729,20 @@ void SpatialMaterial::_update_shader() {
 	}
 
 	if (features[FEATURE_EMISSION]) {
-		if (flags[FLAG_UV1_USE_TRIPLANAR]) {
-			code += "\tvec3 emission_tex = triplanar_texture(texture_emission,uv1_power_normal,uv1_triplanar_pos).rgb;\n";
+		if (flags[FLAG_EMISSION_ON_UV2]) {
+			if (flags[FLAG_UV2_USE_TRIPLANAR]) {
+				code += "\tvec3 emission_tex = triplanar_texture(texture_emission,uv2_power_normal,uv2_triplanar_pos).rgb;\n";
+			} else {
+				code += "\tvec3 emission_tex = texture(texture_emission,base_uv2).rgb;\n";
+			}
 		} else {
-			code += "\tvec3 emission_tex = texture(texture_emission,base_uv).rgb;\n";
+			if (flags[FLAG_UV1_USE_TRIPLANAR]) {
+				code += "\tvec3 emission_tex = triplanar_texture(texture_emission,uv1_power_normal,uv1_triplanar_pos).rgb;\n";
+			} else {
+				code += "\tvec3 emission_tex = texture(texture_emission,base_uv).rgb;\n";
+			}
 		}
+
 		if (emission_op == EMISSION_OP_ADD) {
 			code += "\tEMISSION = (emission.rgb+emission_tex)*emission_energy;\n";
 		} else {
@@ -1892,6 +1901,7 @@ void SpatialMaterial::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::COLOR, "emission", PROPERTY_HINT_COLOR_NO_ALPHA), "set_emission", "get_emission");
 	ADD_PROPERTY(PropertyInfo(Variant::REAL, "emission_energy", PROPERTY_HINT_RANGE, "0,16,0.01"), "set_emission_energy", "get_emission_energy");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "emission_operator", PROPERTY_HINT_ENUM, "Add,Multiply"), "set_emission_operator", "get_emission_operator");
+	ADD_PROPERTYI(PropertyInfo(Variant::BOOL, "emission_on_uv2"), "set_flag", "get_flag", FLAG_EMISSION_ON_UV2);
 	ADD_PROPERTYI(PropertyInfo(Variant::OBJECT, "emission_texture", PROPERTY_HINT_RESOURCE_TYPE, "Texture"), "set_texture", "get_texture", TEXTURE_EMISSION);
 
 	ADD_GROUP("NormalMap", "normal_");
@@ -2034,6 +2044,7 @@ void SpatialMaterial::_bind_methods() {
 	BIND_ENUM_CONSTANT(FLAG_UV1_USE_TRIPLANAR);
 	BIND_ENUM_CONSTANT(FLAG_UV2_USE_TRIPLANAR);
 	BIND_ENUM_CONSTANT(FLAG_AO_ON_UV2);
+	BIND_ENUM_CONSTANT(FLAG_EMISSION_ON_UV2);
 	BIND_ENUM_CONSTANT(FLAG_USE_ALPHA_SCISSOR);
 	BIND_ENUM_CONSTANT(FLAG_TRIPLANAR_USE_WORLD);
 	BIND_ENUM_CONSTANT(FLAG_ALBEDO_TEXTURE_FORCE_SRGB);
diff --git a/scene/resources/material.h b/scene/resources/material.h
index 7cfa38fce4..d5c3ef83e2 100644
--- a/scene/resources/material.h
+++ b/scene/resources/material.h
@@ -184,6 +184,7 @@ public:
 		FLAG_UV2_USE_TRIPLANAR,
 		FLAG_TRIPLANAR_USE_WORLD,
 		FLAG_AO_ON_UV2,
+		FLAG_EMISSION_ON_UV2,
 		FLAG_USE_ALPHA_SCISSOR,
 		FLAG_ALBEDO_TEXTURE_FORCE_SRGB,
 		FLAG_MAX
@@ -234,7 +235,7 @@ private:
 			uint64_t blend_mode : 2;
 			uint64_t depth_draw_mode : 2;
 			uint64_t cull_mode : 2;
-			uint64_t flags : 13;
+			uint64_t flags : 14;
 			uint64_t detail_blend_mode : 2;
 			uint64_t diffuse_mode : 3;
 			uint64_t specular_mode : 2;
diff --git a/scene/resources/mesh.cpp b/scene/resources/mesh.cpp
index 0b352efca2..bb33962be6 100644
--- a/scene/resources/mesh.cpp
+++ b/scene/resources/mesh.cpp
@@ -1123,27 +1123,29 @@ Error ArrayMesh::lightmap_unwrap(const Transform &p_base_transform, float p_texe
 
 		PoolVector<int> rindices = arrays[Mesh::ARRAY_INDEX];
 		int ic = rindices.size();
-		int index_ofs = indices.size();
 
 		if (ic == 0) {
-			indices.resize(index_ofs + vc);
-			face_materials.resize((index_ofs + vc) / 3);
-			for (int j = 0; j < vc; j++) {
-				indices[index_ofs + j] = vertex_ofs + j;
-			}
+
 			for (int j = 0; j < vc / 3; j++) {
-				face_materials[(index_ofs / 3) + j] = i;
+				if (Face3(r[j * 3 + 0], r[j * 3 + 1], r[j * 3 + 2]).is_degenerate())
+					continue;
+
+				indices.push_back(vertex_ofs + j * 3 + 0);
+				indices.push_back(vertex_ofs + j * 3 + 1);
+				indices.push_back(vertex_ofs + j * 3 + 2);
+				face_materials.push_back(i);
 			}
 
 		} else {
 			PoolVector<int>::Read ri = rindices.read();
-			indices.resize(index_ofs + ic);
-			face_materials.resize((index_ofs + ic) / 3);
-			for (int j = 0; j < ic; j++) {
-				indices[index_ofs + j] = vertex_ofs + ri[j];
-			}
+
 			for (int j = 0; j < ic / 3; j++) {
-				face_materials[(index_ofs / 3) + j] = i;
+				if (Face3(r[ri[j * 3 + 0]], r[ri[j * 3 + 1]], r[ri[j * 3 + 2]]).is_degenerate())
+					continue;
+				indices.push_back(vertex_ofs + ri[j * 3 + 0]);
+				indices.push_back(vertex_ofs + ri[j * 3 + 1]);
+				indices.push_back(vertex_ofs + ri[j * 3 + 2]);
+				face_materials.push_back(i);
 			}
 		}
 
diff --git a/scene/resources/scene_format_text.cpp b/scene/resources/scene_format_text.cpp
index fe23fbf6b3..aebbb5b562 100644
--- a/scene/resources/scene_format_text.cpp
+++ b/scene/resources/scene_format_text.cpp
@@ -28,7 +28,7 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 #include "scene_format_text.h"
-
+#include "core/io/resource_format_binary.h"
 #include "os/dir_access.h"
 #include "project_settings.h"
 #include "version.h"
@@ -53,6 +53,60 @@ Ref<Resource> ResourceInteractiveLoaderText::get_resource() {
 	return resource;
 }
 
+Error ResourceInteractiveLoaderText::_parse_sub_resource_dummy(DummyReadData *p_data, VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str) {
+	// Reads a number followed by ')' from p_stream and hands back, through r_res, the DummyResource registered in p_data for that sub-resource index.
+	VariantParser::Token token;
+	VariantParser::get_token(p_stream, token, line, r_err_str);
+	if (token.type != VariantParser::TK_NUMBER) {
+		r_err_str = "Expected number (sub-resource index)";
+		return ERR_PARSE_ERROR;
+	}
+
+	int index = token.value;
+	// First time this index is seen: create a DummyResource tagged with the index and record it in both resource_map and resource_set.
+	if (!p_data->resource_map.has(index)) {
+		Ref<DummyResource> dr;
+		dr.instance();
+		dr->set_subindex(index);
+		p_data->resource_map[index] = dr;
+		p_data->resource_set.insert(dr);
+	}
+
+	r_res = p_data->resource_map[index];
+	// Note: r_res is already assigned here, even if the closing-parenthesis check below fails with ERR_PARSE_ERROR.
+	VariantParser::get_token(p_stream, token, line, r_err_str);
+	if (token.type != VariantParser::TK_PARENTHESIS_CLOSE) {
+		r_err_str = "Expected ')'";
+		return ERR_PARSE_ERROR;
+	}
+
+	return OK;
+}
+
+Error ResourceInteractiveLoaderText::_parse_ext_resource_dummy(DummyReadData *p_data, VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str) {
+	// Reads a number followed by ')' from p_stream and hands back, through r_res, the entry stored in p_data->rev_external_resources under that id.
+	VariantParser::Token token;
+	VariantParser::get_token(p_stream, token, line, r_err_str);
+	if (token.type != VariantParser::TK_NUMBER) {
+		r_err_str = "Expected number (external resource index)";
+		return ERR_PARSE_ERROR;
+	}
+
+	int id = token.value;
+	// An id that was never registered is a parse error. NOTE(review): r_err_str is not assigned on this path.
+	ERR_FAIL_COND_V(!p_data->rev_external_resources.has(id), ERR_PARSE_ERROR);
+
+	r_res = p_data->rev_external_resources[id];
+	// Note: r_res is already assigned here, even if the closing-parenthesis check below fails with ERR_PARSE_ERROR.
+	VariantParser::get_token(p_stream, token, line, r_err_str);
+	if (token.type != VariantParser::TK_PARENTHESIS_CLOSE) {
+		r_err_str = "Expected ')'";
+		return ERR_PARSE_ERROR;
+	}
+
+	return OK;
+}
+
 Error ResourceInteractiveLoaderText::_parse_sub_resource(VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str) {
 
 	VariantParser::Token token;
@@ -131,6 +185,203 @@ Error ResourceInteractiveLoaderText::_parse_ext_resource(VariantParser::Stream *
 	return OK;
 }
 
+Ref<PackedScene> ResourceInteractiveLoaderText::_parse_node_tag(VariantParser::ResourceParser &parser) {
+	Ref<PackedScene> packed_scene;
+	packed_scene.instance();
+	// Consumes "node", "connection" and "editable" tags, starting at next_tag, into packed_scene; returns it on ERR_FILE_EOF and an empty Ref<PackedScene>() on any other error.
+	while (true) {
+
+		if (next_tag.name == "node") {
+
+			int parent = -1;
+			int owner = -1;
+			int type = -1;
+			int name = -1;
+			int instance = -1;
+			// Values default to -1 and are filled in from the tag's fields below.
+
+			if (next_tag.fields.has("name")) {
+				name = packed_scene->get_state()->add_name(next_tag.fields["name"]);
+			}
+
+			if (next_tag.fields.has("parent")) {
+				NodePath np = next_tag.fields["parent"];
+				np.prepend_period(); // Compatible with how paths are managed internally
+				parent = packed_scene->get_state()->add_node_path(np);
+			}
+
+			if (next_tag.fields.has("type")) {
+				type = packed_scene->get_state()->add_name(next_tag.fields["type"]);
+			} else {
+				type = SceneState::TYPE_INSTANCED; // No type? Assume this was instanced
+			}
+
+			if (next_tag.fields.has("instance")) {
+
+				instance = packed_scene->get_state()->add_value(next_tag.fields["instance"]);
+				// An instanced first node without a parent becomes the base scene instead of a regular instance.
+				if (packed_scene->get_state()->get_node_count() == 0 && parent == -1) {
+					packed_scene->get_state()->set_base_scene(instance);
+					instance = -1;
+				}
+			}
+
+			if (next_tag.fields.has("instance_placeholder")) {
+
+				String path = next_tag.fields["instance_placeholder"];
+
+				int path_v = packed_scene->get_state()->add_value(path);
+
+				if (packed_scene->get_state()->get_node_count() == 0) {
+					error = ERR_FILE_CORRUPT;
+					error_text = "Instance Placeholder can't be used for inheritance.";
+					_printerr();
+					return Ref<PackedScene>();
+				}
+
+				instance = path_v | SceneState::FLAG_INSTANCE_IS_PLACEHOLDER;
+			}
+
+			if (next_tag.fields.has("owner")) {
+				owner = packed_scene->get_state()->add_node_path(next_tag.fields["owner"]);
+			} else {
+				if (parent != -1 && !(type == SceneState::TYPE_INSTANCED && instance == -1))
+					owner = 0; // If no owner, owner is root
+			}
+
+			int node_id = packed_scene->get_state()->add_node(parent, owner, type, name, instance);
+
+			if (next_tag.fields.has("groups")) {
+
+				Array groups = next_tag.fields["groups"];
+				for (int i = 0; i < groups.size(); i++) {
+					packed_scene->get_state()->add_node_group(node_id, packed_scene->get_state()->add_name(groups[i]));
+				}
+			}
+			// Read this node's property assignments until the next tag starts or the file ends.
+			while (true) {
+
+				String assign;
+				Variant value;
+
+				error = VariantParser::parse_tag_assign_eof(&stream, lines, error_text, next_tag, assign, value, &parser);
+
+				if (error) {
+					if (error != ERR_FILE_EOF) {
+						_printerr();
+						return Ref<PackedScene>();
+					} else {
+						return packed_scene;
+					}
+				}
+
+				if (assign != String()) {
+					int nameidx = packed_scene->get_state()->add_name(assign);
+					int valueidx = packed_scene->get_state()->add_value(value);
+					packed_scene->get_state()->add_node_property(node_id, nameidx, valueidx);
+					// It's an assignment
+				} else if (next_tag.name != String()) {
+					break;
+				}
+			}
+		} else if (next_tag.name == "connection") {
+			// NOTE(review): the four missing-field checks below set error/error_text but, unlike the other failure paths in this function, do not call _printerr().
+			if (!next_tag.fields.has("from")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "missing 'from' field from connection tag";
+				return Ref<PackedScene>();
+			}
+
+			if (!next_tag.fields.has("to")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "missing 'to' field from connection tag";
+				return Ref<PackedScene>();
+			}
+
+			if (!next_tag.fields.has("signal")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "missing 'signal' field from connection tag";
+				return Ref<PackedScene>();
+			}
+
+			if (!next_tag.fields.has("method")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "missing 'method' field from connection tag";
+				return Ref<PackedScene>();
+			}
+
+			NodePath from = next_tag.fields["from"];
+			NodePath to = next_tag.fields["to"];
+			StringName method = next_tag.fields["method"];
+			StringName signal = next_tag.fields["signal"];
+			int flags = CONNECT_PERSIST;
+			Array binds;
+
+			if (next_tag.fields.has("flags")) {
+				flags = next_tag.fields["flags"];
+			}
+
+			if (next_tag.fields.has("binds")) {
+				binds = next_tag.fields["binds"];
+			}
+
+			Vector<int> bind_ints;
+			for (int i = 0; i < binds.size(); i++) {
+				bind_ints.push_back(packed_scene->get_state()->add_value(binds[i]));
+			}
+
+			packed_scene->get_state()->add_connection(
+					packed_scene->get_state()->add_node_path(from.simplified()),
+					packed_scene->get_state()->add_node_path(to.simplified()),
+					packed_scene->get_state()->add_name(signal),
+					packed_scene->get_state()->add_name(method),
+					flags,
+					bind_ints);
+
+			error = VariantParser::parse_tag(&stream, lines, error_text, next_tag, &parser);
+
+			if (error) {
+				if (error != ERR_FILE_EOF) {
+					_printerr();
+					return Ref<PackedScene>();
+				} else {
+					return packed_scene;
+				}
+			}
+		} else if (next_tag.name == "editable") {
+
+			if (!next_tag.fields.has("path")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "missing 'path' field from editable tag";
+				_printerr();
+				return Ref<PackedScene>();
+			}
+
+			NodePath path = next_tag.fields["path"];
+
+			packed_scene->get_state()->add_editable_instance(path.simplified());
+
+			error = VariantParser::parse_tag(&stream, lines, error_text, next_tag, &parser);
+
+			if (error) {
+				if (error != ERR_FILE_EOF) {
+					_printerr();
+					return Ref<PackedScene>();
+				} else {
+					return packed_scene;
+				}
+			}
+		} else {
+			// Any other tag name is rejected. NOTE(review): error_text is not assigned on this path before _printerr() is called.
+			error = ERR_FILE_CORRUPT;
+			_printerr();
+			return Ref<PackedScene>();
+		}
+	}
+
+	return packed_scene; // Not reached: the outer loop above has no break, every exit is a return.
+}
+
 Error ResourceInteractiveLoaderText::poll() {
 
 	if (error != OK)
@@ -364,231 +615,21 @@ Error ResourceInteractiveLoaderText::poll() {
 			return error;
 		}
 
-		/*
-		int add_name(const StringName& p_name);
-		int add_value(const Variant& p_value);
-		int add_node_path(const NodePath& p_path);
-		int add_node(int p_parent,int p_owner,int p_type,int p_name, int p_instance);
-		void add_node_property(int p_node,int p_name,int p_value);
-		void add_node_group(int p_node,int p_group);
-		void set_base_scene(int p_idx);
-		void add_connection(int p_from,int p_to, int p_signal, int p_method, int p_flags,const Vector<int>& p_binds);
-		void add_editable_instance(const NodePath& p_path);
-
-		*/
-
-		int parent = -1;
-		int owner = -1;
-		int type = -1;
-		int name = -1;
-		int instance = -1;
-		//int base_scene=-1;
-
-		if (next_tag.fields.has("name")) {
-			name = packed_scene->get_state()->add_name(next_tag.fields["name"]);
-		}
-
-		if (next_tag.fields.has("parent")) {
-			NodePath np = next_tag.fields["parent"];
-			np.prepend_period(); //compatible to how it manages paths internally
-			parent = packed_scene->get_state()->add_node_path(np);
-		}
-
-		if (next_tag.fields.has("type")) {
-			type = packed_scene->get_state()->add_name(next_tag.fields["type"]);
-		} else {
-			type = SceneState::TYPE_INSTANCED; //no type? assume this was instanced
-		}
-
-		if (next_tag.fields.has("instance")) {
-
-			instance = packed_scene->get_state()->add_value(next_tag.fields["instance"]);
-
-			if (packed_scene->get_state()->get_node_count() == 0 && parent == -1) {
-				packed_scene->get_state()->set_base_scene(instance);
-				instance = -1;
-			}
-		}
-
-		if (next_tag.fields.has("instance_placeholder")) {
-
-			String path = next_tag.fields["instance_placeholder"];
-
-			int path_v = packed_scene->get_state()->add_value(path);
-
-			if (packed_scene->get_state()->get_node_count() == 0) {
-				error = ERR_FILE_CORRUPT;
-				error_text = "Instance Placeholder can't be used for inheritance.";
-				_printerr();
-				return error;
-			}
-
-			instance = path_v | SceneState::FLAG_INSTANCE_IS_PLACEHOLDER;
-		}
-
-		if (next_tag.fields.has("owner")) {
-			owner = packed_scene->get_state()->add_node_path(next_tag.fields["owner"]);
-		} else {
-			if (parent != -1 && !(type == SceneState::TYPE_INSTANCED && instance == -1))
-				owner = 0; //if no owner, owner is root
-		}
-
-		int node_id = packed_scene->get_state()->add_node(parent, owner, type, name, instance);
-
-		if (next_tag.fields.has("groups")) {
-
-			Array groups = next_tag.fields["groups"];
-			for (int i = 0; i < groups.size(); i++) {
-				packed_scene->get_state()->add_node_group(node_id, packed_scene->get_state()->add_name(groups[i]));
-			}
-		}
-
-		while (true) {
-
-			String assign;
-			Variant value;
-
-			error = VariantParser::parse_tag_assign_eof(&stream, lines, error_text, next_tag, assign, value, &rp);
-
-			if (error) {
-				if (error != ERR_FILE_EOF) {
-					_printerr();
-				} else {
-					resource = packed_scene;
-					if (!ResourceCache::has(res_path)) {
-						packed_scene->set_path(res_path);
-					}
-				}
-				return error;
-			}
-
-			if (assign != String()) {
-				int nameidx = packed_scene->get_state()->add_name(assign);
-				int valueidx = packed_scene->get_state()->add_value(value);
-				packed_scene->get_state()->add_node_property(node_id, nameidx, valueidx);
-				//it's assignment
-			} else if (next_tag.name != String()) {
-
-				error = OK;
-				return error;
-			} else {
-
-				resource = packed_scene;
-				error = ERR_FILE_EOF;
-				return error;
-			}
-		}
-
-		return OK;
-
-	} else if (next_tag.name == "connection") {
-
-		if (!is_scene) {
-
-			error_text += "found the 'connection' tag on a resource file!";
-			_printerr();
-			error = ERR_FILE_CORRUPT;
-			return error;
-		}
-
-		if (!next_tag.fields.has("from")) {
-			error = ERR_FILE_CORRUPT;
-			error_text = "missing 'from' field fron connection tag";
-			return error;
-		}
-
-		if (!next_tag.fields.has("to")) {
-			error = ERR_FILE_CORRUPT;
-			error_text = "missing 'to' field fron connection tag";
-			return error;
-		}
-
-		if (!next_tag.fields.has("signal")) {
-			error = ERR_FILE_CORRUPT;
-			error_text = "missing 'signal' field fron connection tag";
-			return error;
-		}
-
-		if (!next_tag.fields.has("method")) {
-			error = ERR_FILE_CORRUPT;
-			error_text = "missing 'method' field fron connection tag";
-			return error;
-		}
-
-		NodePath from = next_tag.fields["from"];
-		NodePath to = next_tag.fields["to"];
-		StringName method = next_tag.fields["method"];
-		StringName signal = next_tag.fields["signal"];
-		int flags = CONNECT_PERSIST;
-		Array binds;
-
-		if (next_tag.fields.has("flags")) {
-			flags = next_tag.fields["flags"];
-		}
-
-		if (next_tag.fields.has("binds")) {
-			binds = next_tag.fields["binds"];
-		}
-
-		Vector<int> bind_ints;
-		for (int i = 0; i < binds.size(); i++) {
-			bind_ints.push_back(packed_scene->get_state()->add_value(binds[i]));
-		}
-
-		packed_scene->get_state()->add_connection(
-				packed_scene->get_state()->add_node_path(from.simplified()),
-				packed_scene->get_state()->add_node_path(to.simplified()),
-				packed_scene->get_state()->add_name(signal),
-				packed_scene->get_state()->add_name(method),
-				flags,
-				bind_ints);
-
-		error = VariantParser::parse_tag(&stream, lines, error_text, next_tag, &rp);
-
-		if (error) {
-			if (error != ERR_FILE_EOF) {
-				_printerr();
-			} else {
-				resource = packed_scene;
-			}
-		}
-
-		return error;
-	} else if (next_tag.name == "editable") {
+		Ref<PackedScene> packed_scene = _parse_node_tag(rp);
 
-		if (!is_scene) {
-
-			error_text += "found the 'editable' tag on a resource file!";
-			_printerr();
-			error = ERR_FILE_CORRUPT;
+		if (!packed_scene.is_valid())
 			return error;
-		}
 
-		if (!next_tag.fields.has("path")) {
-			error = ERR_FILE_CORRUPT;
-			error_text = "missing 'path' field fron connection tag";
-			_printerr();
-			return error;
+		error = OK;
+		//get it here
+		resource = packed_scene;
+		if (!ResourceCache::has(res_path)) {
+			packed_scene->set_path(res_path);
 		}
 
-		NodePath path = next_tag.fields["path"];
-
-		packed_scene->get_state()->add_editable_instance(path.simplified());
-
-		error = VariantParser::parse_tag(&stream, lines, error_text, next_tag, &rp);
-
-		if (error) {
-			if (error != ERR_FILE_EOF) {
-				_printerr();
-			} else {
-				resource = packed_scene;
-			}
-		}
-
-		return error;
+		return ERR_FILE_EOF;
 
 	} else {
-
 		error_text += "Unknown tag in file: " + next_tag.name;
 		_printerr();
 		error = ERR_FILE_CORRUPT;
@@ -804,7 +845,6 @@ void ResourceInteractiveLoaderText::open(FileAccess *p_f, bool p_skip_first_tag)
 
 	if (tag.name == "gd_scene") {
 		is_scene = true;
-		packed_scene.instance();
 
 	} else if (tag.name == "gd_resource") {
 		if (!tag.fields.has("type")) {
@@ -846,6 +886,281 @@ void ResourceInteractiveLoaderText::open(FileAccess *p_f, bool p_skip_first_tag)
 	rp.userdata = this;
 }
 
+static void bs_save_unicode_string(FileAccess *f, const String &p_string, bool p_bit_on_len = false) {
+	// Stores p_string as UTF-8: a 32-bit length field, then length() + 1 bytes (presumably the text plus its NUL terminator).
+	CharString utf8 = p_string.utf8();
+	uint32_t stored_len = utf8.length() + 1;
+	if (p_bit_on_len) {
+		stored_len |= 0x80000000; // caller asked for the top bit of the length field to be set
+	}
+	f->store_32(stored_len);
+	f->store_buffer((const uint8_t *)utf8.get_data(), utf8.length() + 1);
+}
+
+Error ResourceInteractiveLoaderText::save_as_binary(FileAccess *p_f, const String &p_path) {
+	// Writes the text resource this loader has open to p_path in binary "RSRC" form. Returns OK, the pending/parse error, ERR_CANT_OPEN or ERR_FILE_CORRUPT. p_f is not used by this body.
+	if (error)
+		return error;
+
+	FileAccessRef wf = FileAccess::open(p_path, FileAccess::WRITE);
+	if (!wf) {
+		return ERR_CANT_OPEN;
+	}
+
+	//save header compressed
+	static const uint8_t header[4] = { 'R', 'S', 'R', 'C' };
+	wf->store_buffer(header, 4);
+
+	wf->store_32(0); //endianness, little endian
+	wf->store_32(0); //64 bits file, false for now
+	wf->store_32(VERSION_MAJOR);
+	wf->store_32(VERSION_MINOR);
+	static const int save_format_version = 3; //use format version 3 for saving
+	wf->store_32(save_format_version);
+
+	bs_save_unicode_string(wf.f, is_scene ? "PackedScene" : resource_type);
+	wf->store_64(0); //offset to import metadata, this is no longer used
+	for (int i = 0; i < 14; i++)
+		wf->store_32(0); // reserved
+
+	wf->store_32(0); //string table size, will not be in use
+	size_t ext_res_count_pos = wf->get_position();
+
+	wf->store_32(0); //zero ext resources, still parsing them
+
+	//go with external resources
+
+	DummyReadData dummy_read;
+	VariantParser::ResourceParser rp;
+	rp.ext_func = _parse_ext_resource_dummys;
+	rp.sub_func = _parse_sub_resource_dummys;
+	rp.userdata = &dummy_read;
+
+	while (next_tag.name == "ext_resource") {
+
+		if (!next_tag.fields.has("path")) {
+			error = ERR_FILE_CORRUPT;
+			error_text = "Missing 'path' in external resource tag";
+			_printerr();
+			return error;
+		}
+
+		if (!next_tag.fields.has("type")) {
+			error = ERR_FILE_CORRUPT;
+			error_text = "Missing 'type' in external resource tag";
+			_printerr();
+			return error;
+		}
+
+		if (!next_tag.fields.has("id")) {
+			error = ERR_FILE_CORRUPT;
+			error_text = "Missing 'id' in external resource tag";
+			_printerr();
+			return error;
+		}
+
+		String path = next_tag.fields["path"];
+		String type = next_tag.fields["type"];
+		int index = next_tag.fields["id"];
+
+		bs_save_unicode_string(wf.f, type);
+		bs_save_unicode_string(wf.f, path);
+
+		int lindex = dummy_read.external_resources.size();
+		Ref<DummyResource> dr;
+		dr.instance();
+		dr->set_path("res://dummy" + itos(lindex)); //anything is good to detect it for saving as external
+		dummy_read.external_resources[dr] = lindex;
+		dummy_read.rev_external_resources[index] = dr;
+
+		error = VariantParser::parse_tag(&stream, lines, error_text, next_tag, &rp);
+
+		if (error) {
+			_printerr();
+			return error;
+		}
+	}
+
+	// save external resource table
+	wf->seek(ext_res_count_pos);
+	wf->store_32(dummy_read.external_resources.size());
+	wf->seek_end();
+
+	//now, save resources to a separate file, for now
+	// The bodies go to a temp file because their final offsets depend on the pointer table that wf is still growing.
+	size_t sub_res_count_pos = wf->get_position();
+	wf->store_32(0); //zero sub resources, still parsing them
+
+	String temp_file = p_path + ".temp";
+	FileAccessRef wf2 = FileAccess::open(temp_file, FileAccess::WRITE);
+	if (!wf2) {
+		return ERR_CANT_OPEN;
+	}
+
+	Vector<size_t> local_offsets;
+	Vector<size_t> local_pointers_pos;
+
+	while (next_tag.name == "sub_resource" || next_tag.name == "resource") {
+
+		String type;
+		int id = -1;
+		bool main_res;
+
+		if (next_tag.name == "sub_resource") {
+			if (!next_tag.fields.has("type")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "Missing 'type' in sub_resource tag";
+				_printerr();
+				return error;
+			}
+
+			if (!next_tag.fields.has("id")) {
+				error = ERR_FILE_CORRUPT;
+				error_text = "Missing 'id' in sub_resource tag";
+				_printerr();
+				return error;
+			}
+
+			type = next_tag.fields["type"];
+			id = next_tag.fields["id"];
+			main_res = false;
+		} else {
+			type = res_type;
+			id = 0; //used for last anyway
+			main_res = true;
+		}
+
+		local_offsets.push_back(wf2->get_position());
+
+		bs_save_unicode_string(wf, "local://" + itos(id));
+		local_pointers_pos.push_back(wf->get_position());
+		wf->store_64(0); //temp local offset
+
+		bs_save_unicode_string(wf2, type);
+		size_t propcount_ofs = wf2->get_position();
+		wf2->store_32(0);
+
+		int prop_count = 0;
+		// Copy each "name = value" assignment of this resource until the next tag (or EOF for the main resource).
+		while (true) {
+
+			String assign;
+			Variant value;
+
+			error = VariantParser::parse_tag_assign_eof(&stream, lines, error_text, next_tag, assign, value, &rp);
+
+			if (error) {
+				if (main_res && error == ERR_FILE_EOF) {
+					next_tag.name = ""; //exit
+					break;
+				}
+
+				_printerr();
+				return error;
+			}
+
+			if (assign != String()) {
+
+				Map<StringName, int> empty_string_map; //unused
+				bs_save_unicode_string(wf2, assign, true);
+				ResourceFormatSaverBinaryInstance::write_variant(wf2, value, dummy_read.resource_set, dummy_read.external_resources, empty_string_map);
+				prop_count++;
+
+			} else if (next_tag.name != String()) {
+
+				error = OK;
+				break;
+			} else {
+				error = ERR_FILE_CORRUPT;
+				error_text = "Premature end of file while parsing [sub_resource]";
+				_printerr();
+				return error;
+			}
+		}
+
+		wf2->seek(propcount_ofs);
+		wf2->store_32(prop_count);
+		wf2->seek_end();
+	}
+
+	if (next_tag.name == "node") {
+		//this is a node, must save one more!
+
+		if (!is_scene) {
+
+			error_text += "found the 'node' tag on a resource file!";
+			_printerr();
+			error = ERR_FILE_CORRUPT;
+			return error;
+		}
+
+		Ref<PackedScene> packed_scene = _parse_node_tag(rp);
+
+		if (!packed_scene.is_valid())
+			return error;
+
+		error = OK;
+		//get it here
+		List<PropertyInfo> props;
+		packed_scene->get_property_list(&props);
+
+		bs_save_unicode_string(wf, "local://0");
+		local_pointers_pos.push_back(wf->get_position());
+		wf->store_64(0); //temp local offset
+
+		local_offsets.push_back(wf2->get_position());
+		bs_save_unicode_string(wf2, "PackedScene");
+		size_t propcount_ofs = wf2->get_position();
+		wf2->store_32(0);
+
+		int prop_count = 0;
+
+		for (List<PropertyInfo>::Element *E = props.front(); E; E = E->next()) {
+
+			if (!(E->get().usage & PROPERTY_USAGE_STORAGE))
+				continue;
+
+			String name = E->get().name;
+			Variant value = packed_scene->get(name);
+
+			Map<StringName, int> empty_string_map; //unused
+			bs_save_unicode_string(wf2, name, true);
+			ResourceFormatSaverBinaryInstance::write_variant(wf2, value, dummy_read.resource_set, dummy_read.external_resources, empty_string_map);
+			prop_count++;
+		}
+
+		wf2->seek(propcount_ofs);
+		wf2->store_32(prop_count);
+		wf2->seek_end();
+	}
+
+	wf2->close();
+	// The temp-file contents are appended after everything wf holds now, so every recorded offset shifts by offset_from.
+	size_t offset_from = wf->get_position();
+	wf->seek(sub_res_count_pos); //overwrite the placeholder count written earlier
+	wf->store_32(local_offsets.size());
+
+	for (int i = 0; i < local_offsets.size(); i++) {
+		wf->seek(local_pointers_pos[i]);
+		wf->store_64(local_offsets[i] + offset_from);
+	}
+
+	wf->seek_end();
+
+	Vector<uint8_t> data = FileAccess::get_file_as_array(temp_file);
+	wf->store_buffer(data.ptr(), data.size());
+	{
+		DirAccessRef dar = DirAccess::open(temp_file.get_base_dir());
+		if (dar) dar->remove(temp_file); // best-effort cleanup; skip it rather than dereference a failed open
+	}
+
+	wf->store_buffer((const uint8_t *)"RSRC", 4); //magic at end
+
+	wf->close();
+
+	return OK;
+}
+
 String ResourceInteractiveLoaderText::recognize(FileAccess *p_f) {
 
 	error = OK;
@@ -991,6 +1306,25 @@ Error ResourceFormatLoaderText::rename_dependencies(const String &p_path, const
 	return ria->rename_dependencies(f, p_path, p_map);
 }
 
+Error ResourceFormatLoaderText::convert_file_to_binary(const String &p_src_path, const String &p_dst_path) {
+	// Reads the text resource at p_src_path and writes its binary equivalent to p_dst_path.
+	// Yields ERR_CANT_OPEN when the source cannot be opened, otherwise whatever save_as_binary() reports.
+	Error err;
+	FileAccess *src = FileAccess::open(p_src_path, FileAccess::READ, &err);
+	ERR_FAIL_COND_V(err != OK, ERR_CANT_OPEN);
+
+	Ref<ResourceInteractiveLoaderText> loader = memnew(ResourceInteractiveLoaderText);
+
+	// The loader's local path and resource path are both the project-localized source path.
+	const String localized = ProjectSettings::get_singleton()->localize_path(p_src_path);
+	loader->local_path = localized;
+	loader->res_path = localized;
+
+	// open() runs first; save_as_binary() then reads loader state such as is_scene that open() fills in.
+	loader->open(src);
+	return loader->save_as_binary(src, p_dst_path);
+}
+
 /*****************************************************************************************************/
 /*****************************************************************************************************/
 /*****************************************************************************************************/
diff --git a/scene/resources/scene_format_text.h b/scene/resources/scene_format_text.h
index a72a62037c..5d3c2004c1 100644
--- a/scene/resources/scene_format_text.h
+++ b/scene/resources/scene_format_text.h
@@ -78,9 +78,26 @@ class ResourceInteractiveLoaderText : public ResourceInteractiveLoader {
 	Error _parse_sub_resource(VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str);
 	Error _parse_ext_resource(VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str);
 
-	VariantParser::ResourceParser rp;
+	// Placeholder resource for the text-to-binary converter: save_as_binary() instances one per [ext_resource] tag.
+	class DummyResource : public Resource {
+	public:
+	};
 
-	Ref<PackedScene> packed_scene;
+	struct DummyReadData {
+		// Bookkeeping that save_as_binary() hands to the dummy parser callbacks through ResourceParser::userdata.
+		Map<RES, int> external_resources; // placeholder resource -> its position in the external resource table
+		Map<int, RES> rev_external_resources; // "id" field of an [ext_resource] tag -> its placeholder resource
+		Set<RES> resource_set; // passed to write_variant() together with external_resources
+		Map<int, RES> resource_map; // NOTE(review): presumably sub-resource id -> placeholder; not filled by save_as_binary() itself -- confirm
+	};
+	// Static trampolines assigned to ResourceParser::sub_func / ext_func; p_self is cast back to DummyReadData (presumably the parser's userdata).
+	static Error _parse_sub_resource_dummys(void *p_self, VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str) { return _parse_sub_resource_dummy((DummyReadData *)(p_self), p_stream, r_res, line, r_err_str); }
+	static Error _parse_ext_resource_dummys(void *p_self, VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str) { return _parse_ext_resource_dummy((DummyReadData *)(p_self), p_stream, r_res, line, r_err_str); }
+	// Typed implementations the trampolines forward to; only declared here.
+	static Error _parse_sub_resource_dummy(DummyReadData *p_data, VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str);
+	static Error _parse_ext_resource_dummy(DummyReadData *p_data, VariantParser::Stream *p_stream, Ref<Resource> &r_res, int &line, String &r_err_str);
+
+	VariantParser::ResourceParser rp;
 
 	friend class ResourceFormatLoaderText;
 
@@ -89,6 +106,8 @@ class ResourceInteractiveLoaderText : public ResourceInteractiveLoader {
 
 	RES resource;
 
+	Ref<PackedScene> _parse_node_tag(VariantParser::ResourceParser &parser);
+
 public:
 	virtual void set_local_path(const String &p_local_path);
 	virtual Ref<Resource> get_resource();
@@ -102,6 +121,7 @@ public:
 	void get_dependencies(FileAccess *p_f, List<String> *p_dependencies, bool p_add_types);
 	Error rename_dependencies(FileAccess *p_f, const String &p_path, const Map<String, String> &p_map);
 
+	Error save_as_binary(FileAccess *p_f, const String &p_path);
 	ResourceInteractiveLoaderText();
 	~ResourceInteractiveLoaderText();
 };
@@ -115,6 +135,8 @@ public:
 	virtual String get_resource_type(const String &p_path) const;
 	virtual void get_dependencies(const String &p_path, List<String> *p_dependencies, bool p_add_types = false);
 	virtual Error rename_dependencies(const String &p_path, const Map<String, String> &p_map);
+
+	static Error convert_file_to_binary(const String &p_src_path, const String &p_dst_path);
 };
 
 class ResourceFormatSaverTextInstance {
diff --git a/servers/visual/rasterizer.h b/servers/visual/rasterizer.h
index df41c3b5ce..c5c225a40a 100644
--- a/servers/visual/rasterizer.h
+++ b/servers/visual/rasterizer.h
@@ -112,6 +112,10 @@ public:
 
 		SelfList<InstanceBase> dependency_item;
 
+		InstanceBase *lightmap_capture;
+		RID lightmap;
+		Vector<Color> lightmap_capture_data; //in a array (12 values) to avoid wasting space if unused. Alpha is unused, but needed to send to shader
+
 		virtual void base_removed() = 0;
 		virtual void base_changed() = 0;
 		virtual void base_material_changed() = 0;
@@ -126,6 +130,7 @@ public:
 			depth_layer = 0;
 			layer_mask = 1;
 			baked_light = false;
+			lightmap_capture = NULL;
 		}
 	};
 
@@ -437,6 +442,32 @@ public:
 	virtual RID gi_probe_dynamic_data_create(int p_width, int p_height, int p_depth, GIProbeCompression p_compression) = 0;
 	virtual void gi_probe_dynamic_data_update(RID p_gi_probe_data, int p_depth_slice, int p_slice_count, int p_mipmap, const void *p_data) = 0;
 
+	/* LIGHTMAP CAPTURE */
+
+	struct LightmapCaptureOctree {
+		// Element type of the octree returned by lightmap_capture_get_octree_ptr().
+		enum {
+			CHILD_EMPTY = 0xFFFFFFFF // presumably the children[] value meaning "no child here" -- confirm
+		};
+
+		uint16_t light[6][3]; //anisotropic light; presumably 6 directions x 3 color channels -- TODO confirm
+		float alpha;
+		uint32_t children[8]; // eight child slots; NOTE(review): likely indices into the same octree array -- confirm
+	};
+
+	virtual RID lightmap_capture_create() = 0;
+	virtual void lightmap_capture_set_bounds(RID p_capture, const AABB &p_bounds) = 0;
+	virtual AABB lightmap_capture_get_bounds(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_octree(RID p_capture, const PoolVector<uint8_t> &p_octree) = 0;
+	virtual PoolVector<uint8_t> lightmap_capture_get_octree(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_octree_cell_transform(RID p_capture, const Transform &p_xform) = 0;
+	virtual Transform lightmap_capture_get_octree_cell_transform(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_octree_cell_subdiv(RID p_capture, int p_subdiv) = 0;
+	virtual int lightmap_capture_get_octree_cell_subdiv(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_energy(RID p_capture, float p_energy) = 0;
+	virtual float lightmap_capture_get_energy(RID p_capture) const = 0;
+	virtual const PoolVector<LightmapCaptureOctree> *lightmap_capture_get_octree_ptr(RID p_capture) const = 0;
+
 	/* PARTICLES */
 
 	virtual RID particles_create() = 0;
diff --git a/servers/visual/shader_language.cpp b/servers/visual/shader_language.cpp
index e10a57c571..c69bbb9343 100644
--- a/servers/visual/shader_language.cpp
+++ b/servers/visual/shader_language.cpp
@@ -755,12 +755,12 @@ void ShaderLanguage::clear() {
 	}
 }
 
-bool ShaderLanguage::_find_identifier(const BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types, const StringName &p_identifier, DataType *r_data_type, IdentifierType *r_type) {
+bool ShaderLanguage::_find_identifier(const BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types, const StringName &p_identifier, DataType *r_data_type, IdentifierType *r_type) {
 
 	if (p_builtin_types.has(p_identifier)) {
 
 		if (r_data_type) {
-			*r_data_type = p_builtin_types[p_identifier];
+			*r_data_type = p_builtin_types[p_identifier].type;
 		}
 		if (r_type) {
 			*r_type = IDENTIFIER_BUILTIN_VAR;
@@ -2008,7 +2008,7 @@ bool ShaderLanguage::_validate_function_call(BlockNode *p_block, OperatorNode *p
 	return false;
 }
 
-bool ShaderLanguage::_parse_function_arguments(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types, OperatorNode *p_func, int *r_complete_arg) {
+bool ShaderLanguage::_parse_function_arguments(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types, OperatorNode *p_func, int *r_complete_arg) {
 
 	TkPos pos = _get_tkpos();
 	Token tk = _get_token();
@@ -2261,7 +2261,48 @@ bool ShaderLanguage::_get_completable_identifier(BlockNode *p_block, CompletionT
 	return false;
 }
 
-ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types) {
+bool ShaderLanguage::_is_operator_assign(Operator p_op) const {
+	// True when p_op is plain assignment or one of the compound assignment operators listed below.
+	switch (p_op) {
+		case OP_ASSIGN:
+		case OP_ASSIGN_ADD:
+		case OP_ASSIGN_SUB:
+		case OP_ASSIGN_MUL:
+		case OP_ASSIGN_DIV:
+		case OP_ASSIGN_MOD:
+		case OP_ASSIGN_SHIFT_LEFT:
+		case OP_ASSIGN_SHIFT_RIGHT:
+		case OP_ASSIGN_BIT_AND:
+		case OP_ASSIGN_BIT_OR:
+		case OP_ASSIGN_BIT_XOR:
+			return true;
+		default:
+			break;
+	}
+	return false; // single exit for every operator that is not an assignment
+}
+
+bool ShaderLanguage::_validate_assign(Node *p_node, const Map<StringName, BuiltInInfo> &p_builtin_types) {
+	// Returns false when p_node (or, through OP_INDEX nodes, its first argument) is a variable naming a built-in flagged constant; true otherwise.
+	if (p_node->type == Node::TYPE_OPERATOR) {
+		// The operator kind lives in OperatorNode::op; `type` is the Node::Type and is always TYPE_OPERATOR in this branch.
+		OperatorNode *op = static_cast<OperatorNode *>(p_node);
+		if (op->op == OP_INDEX) {
+			return _validate_assign(op->arguments[0], p_builtin_types);
+		}
+	}
+
+	if (p_node->type == Node::TYPE_VARIABLE) {
+		// Only names present in p_builtin_types can be rejected; any other variable passes.
+		VariableNode *var = static_cast<VariableNode *>(p_node);
+		if (p_builtin_types.has(var->name) && p_builtin_types[var->name].constant) {
+			return false; //ops not valid
+		}
+	}
+	return true;
+}
+
+ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types) {
 
 	Vector<Expression> expression;
 	//Vector<TokenType> operators;
@@ -2765,6 +2806,11 @@ ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, cons
 					_set_error("Invalid base type for increment/decrement operator");
 					return NULL;
 				}
+
+				if (!_validate_assign(expr, p_builtin_types)) {
+					_set_error("Invalid use of increment/decrement operator in constant expression.");
+					return NULL;
+				}
 				expr = op;
 			} else {
 
@@ -2948,6 +2994,11 @@ ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, cons
 
 				OperatorNode *op = alloc_node<OperatorNode>();
 				op->op = expression[i].op;
+				if ((op->op == OP_INCREMENT || op->op == OP_DECREMENT) && !_validate_assign(expression[i + 1].node, p_builtin_types)) {
+
+					_set_error("Can't use increment/decrement operator in constant expression.");
+					return NULL;
+				}
 				op->arguments.push_back(expression[i + 1].node);
 
 				expression[i].is_op = false;
@@ -3019,6 +3070,12 @@ ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, cons
 				ERR_FAIL_V(NULL);
 			}
 
+			if (_is_operator_assign(op->op) && !_validate_assign(expression[next_op - 1].node, p_builtin_types)) {
+
+				_set_error("Assignment to constant expression.");
+				return NULL;
+			}
+
 			if (expression[next_op + 1].is_op) {
 				// this is not invalid and can really appear
 				// but it becomes invalid anyway because no binary op
@@ -3142,7 +3199,7 @@ ShaderLanguage::Node *ShaderLanguage::_reduce_expression(BlockNode *p_block, Sha
 	return p_node;
 }
 
-ShaderLanguage::Node *ShaderLanguage::_parse_and_reduce_expression(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types) {
+ShaderLanguage::Node *ShaderLanguage::_parse_and_reduce_expression(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types) {
 
 	ShaderLanguage::Node *expr = _parse_expression(p_block, p_builtin_types);
 	if (!expr) //errored
@@ -3153,7 +3210,7 @@ ShaderLanguage::Node *ShaderLanguage::_parse_and_reduce_expression(BlockNode *p_
 	return expr;
 }
 
-Error ShaderLanguage::_parse_block(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types, bool p_just_one, bool p_can_break, bool p_can_continue) {
+Error ShaderLanguage::_parse_block(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types, bool p_just_one, bool p_can_break, bool p_can_continue) {
 
 	while (true) {
 
@@ -3636,7 +3693,7 @@ Error ShaderLanguage::_parse_shader(const Map<StringName, FunctionInfo> &p_funct
 
 				name = tk.text;
 
-				if (_find_identifier(NULL, Map<StringName, DataType>(), name)) {
+				if (_find_identifier(NULL, Map<StringName, BuiltInInfo>(), name)) {
 					_set_error("Redefinition of '" + String(name) + "'");
 					return ERR_PARSE_ERROR;
 				}
@@ -3660,7 +3717,7 @@ Error ShaderLanguage::_parse_shader(const Map<StringName, FunctionInfo> &p_funct
 					tk = _get_token();
 					if (tk.type == TK_OP_ASSIGN) {
 
-						Node *expr = _parse_and_reduce_expression(NULL, Map<StringName, DataType>());
+						Node *expr = _parse_and_reduce_expression(NULL, Map<StringName, BuiltInInfo>());
 						if (!expr)
 							return ERR_PARSE_ERROR;
 						if (expr->type != Node::TYPE_CONSTANT) {
@@ -3841,7 +3898,7 @@ Error ShaderLanguage::_parse_shader(const Map<StringName, FunctionInfo> &p_funct
 					return ERR_PARSE_ERROR;
 				}
 
-				if (_find_identifier(NULL, Map<StringName, DataType>(), name)) {
+				if (_find_identifier(NULL, Map<StringName, BuiltInInfo>(), name)) {
 					_set_error("Redefinition of '" + String(name) + "'");
 					return ERR_PARSE_ERROR;
 				}
@@ -3852,7 +3909,7 @@ Error ShaderLanguage::_parse_shader(const Map<StringName, FunctionInfo> &p_funct
 					return ERR_PARSE_ERROR;
 				}
 
-				Map<StringName, DataType> builtin_types;
+				Map<StringName, BuiltInInfo> builtin_types;
 				if (p_functions.has(name)) {
 					builtin_types = p_functions[name].built_ins;
 				}
@@ -4109,7 +4166,7 @@ Error ShaderLanguage::complete(const String &p_code, const Map<StringName, Funct
 
 			if (comp_ident && skip_function != StringName() && p_functions.has(skip_function)) {
 
-				for (Map<StringName, DataType>::Element *E = p_functions[skip_function].built_ins.front(); E; E = E->next()) {
+				for (Map<StringName, BuiltInInfo>::Element *E = p_functions[skip_function].built_ins.front(); E; E = E->next()) {
 					matches.insert(E->key());
 				}
 			}
diff --git a/servers/visual/shader_language.h b/servers/visual/shader_language.h
index e092bf931f..4cf8560990 100644
--- a/servers/visual/shader_language.h
+++ b/servers/visual/shader_language.h
@@ -537,8 +537,18 @@ public:
 	static void get_keyword_list(List<String> *r_keywords);
 	static void get_builtin_funcs(List<String> *r_keywords);
 
+	struct BuiltInInfo { // data type of a shader built-in plus a flag marking it as not assignable
+		DataType type;
+		bool constant; // when true, _validate_assign() rejects writes to the built-in
+		BuiltInInfo() : type(TYPE_VOID), constant(false) {} // defined defaults instead of indeterminate members
+		BuiltInInfo(DataType p_type, bool p_constant = false) { // intentionally implicit: built_ins[...] = TYPE_X relies on the conversion
+			type = p_type;
+			constant = p_constant;
+		}
+	};
+
 	struct FunctionInfo {
-		Map<StringName, DataType> built_ins;
+		Map<StringName, BuiltInInfo> built_ins;
 		bool can_discard;
 	};
 
@@ -601,7 +611,10 @@ private:
 		IDENTIFIER_BUILTIN_VAR,
 	};
 
-	bool _find_identifier(const BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types, const StringName &p_identifier, DataType *r_data_type = NULL, IdentifierType *r_type = NULL);
+	bool _find_identifier(const BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types, const StringName &p_identifier, DataType *r_data_type = NULL, IdentifierType *r_type = NULL);
+
+	bool _is_operator_assign(Operator p_op) const;
+	bool _validate_assign(Node *p_node, const Map<StringName, BuiltInInfo> &p_builtin_types);
 
 	bool _validate_operator(OperatorNode *p_op, DataType *r_ret_type = NULL);
 
@@ -625,14 +638,14 @@ private:
 	static const BuiltinFuncDef builtin_func_defs[];
 	bool _validate_function_call(BlockNode *p_block, OperatorNode *p_func, DataType *r_ret_type);
 
-	bool _parse_function_arguments(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types, OperatorNode *p_func, int *r_complete_arg = NULL);
+	bool _parse_function_arguments(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types, OperatorNode *p_func, int *r_complete_arg = NULL);
 
-	Node *_parse_expression(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types);
+	Node *_parse_expression(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types);
 
 	ShaderLanguage::Node *_reduce_expression(BlockNode *p_block, ShaderLanguage::Node *p_node);
-	Node *_parse_and_reduce_expression(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types);
+	Node *_parse_and_reduce_expression(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types);
 
-	Error _parse_block(BlockNode *p_block, const Map<StringName, DataType> &p_builtin_types, bool p_just_one = false, bool p_can_break = false, bool p_can_continue = false);
+	Error _parse_block(BlockNode *p_block, const Map<StringName, BuiltInInfo> &p_builtin_types, bool p_just_one = false, bool p_can_break = false, bool p_can_continue = false);
 
 	Error _parse_shader(const Map<StringName, FunctionInfo> &p_functions, const Set<String> &p_render_modes, const Set<String> &p_shader_types);
 
diff --git a/servers/visual/shader_types.cpp b/servers/visual/shader_types.cpp
index 7489ca7e3e..a25c5ca65e 100644
--- a/servers/visual/shader_types.cpp
+++ b/servers/visual/shader_types.cpp
@@ -45,6 +45,11 @@ const Set<String> &ShaderTypes::get_types() {
 
 ShaderTypes *ShaderTypes::singleton = NULL;
 
+static ShaderLanguage::BuiltInInfo constt(ShaderLanguage::DataType p_type) {
+	ShaderLanguage::BuiltInInfo info(p_type, true); // second argument sets BuiltInInfo::constant
+	return info;
+}
+
 ShaderTypes::ShaderTypes() {
 	singleton = this;
 
@@ -58,32 +63,32 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["UV2"] = ShaderLanguage::TYPE_VEC2;
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["POINT_SIZE"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INSTANCE_ID"] = ShaderLanguage::TYPE_INT;
-	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INSTANCE_CUSTOM"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INSTANCE_ID"] = constt(ShaderLanguage::TYPE_INT);
+	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INSTANCE_CUSTOM"] = constt(ShaderLanguage::TYPE_VEC4);
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["ROUGHNESS"] = ShaderLanguage::TYPE_FLOAT;
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].can_discard = false;
 
 	//builtins
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["WORLD_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INV_CAMERA_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["CAMERA_MATRIX"] = ShaderLanguage::TYPE_MAT4;
+	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INV_CAMERA_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["CAMERA_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["MODELVIEW_MATRIX"] = ShaderLanguage::TYPE_MAT4;
 	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["INV_PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["VIEWPORT_SIZE"] = ShaderLanguage::TYPE_VEC2;
+	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_SPATIAL].functions["vertex"].built_ins["VIEWPORT_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
 
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["VERTEX"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["FRAGCOORD"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["FRONT_FACING"] = ShaderLanguage::TYPE_BOOL;
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["VERTEX"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["FRAGCOORD"] = constt(ShaderLanguage::TYPE_VEC4);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["FRONT_FACING"] = constt(ShaderLanguage::TYPE_BOOL);
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["NORMAL"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["TANGENT"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["BINORMAL"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["NORMALMAP"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["NORMALMAP_DEPTH"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["UV"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["UV2"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["UV"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["UV2"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["COLOR"] = constt(ShaderLanguage::TYPE_VEC4);
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["ALBEDO"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["ALPHA"] = ShaderLanguage::TYPE_FLOAT;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["METALLIC"] = ShaderLanguage::TYPE_FLOAT;
@@ -103,33 +108,33 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["SCREEN_TEXTURE"] = ShaderLanguage::TYPE_SAMPLER2D;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["DEPTH_TEXTURE"] = ShaderLanguage::TYPE_SAMPLER2D;
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["SCREEN_UV"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["POINT_COORD"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["SIDE"] = ShaderLanguage::TYPE_FLOAT;
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["POINT_COORD"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["SIDE"] = constt(ShaderLanguage::TYPE_FLOAT);
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["ALPHA_SCISSOR"] = ShaderLanguage::TYPE_FLOAT;
 
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["WORLD_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["INV_CAMERA_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["INV_PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["VIEWPORT_SIZE"] = ShaderLanguage::TYPE_VEC2;
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["WORLD_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["INV_CAMERA_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["PROJECTION_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["INV_PROJECTION_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].built_ins["VIEWPORT_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
 	shader_modes[VS::SHADER_SPATIAL].functions["fragment"].can_discard = true;
 
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["WORLD_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["INV_CAMERA_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["INV_PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["VIEWPORT_SIZE"] = ShaderLanguage::TYPE_VEC2;
-
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["NORMAL"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["VIEW"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["LIGHT"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["LIGHT_COLOR"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["ATTENUATION"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["ALBEDO"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["TRANSMISSION"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["ROUGHNESS"] = ShaderLanguage::TYPE_FLOAT;
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["WORLD_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["INV_CAMERA_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["PROJECTION_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["INV_PROJECTION_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["VIEWPORT_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
+
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["NORMAL"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["VIEW"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["LIGHT"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["LIGHT_COLOR"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["ATTENUATION"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["ALBEDO"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["TRANSMISSION"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["ROUGHNESS"] = constt(ShaderLanguage::TYPE_FLOAT);
 	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["DIFFUSE_LIGHT"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_SPATIAL].functions["light"].built_ins["SPECULAR_LIGHT"] = ShaderLanguage::TYPE_VEC3;
 
@@ -177,38 +182,38 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["POINT_SIZE"] = ShaderLanguage::TYPE_FLOAT;
 
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["WORLD_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["PROJECTION_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["EXTRA_MATRIX"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["INSTANCE_CUSTOM"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["AT_LIGHT_PASS"] = ShaderLanguage::TYPE_BOOL;
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["WORLD_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["PROJECTION_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["EXTRA_MATRIX"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["INSTANCE_CUSTOM"] = constt(ShaderLanguage::TYPE_VEC4);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["AT_LIGHT_PASS"] = constt(ShaderLanguage::TYPE_BOOL);
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["vertex"].can_discard = false;
 
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["FRAGCOORD"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["FRAGCOORD"] = constt(ShaderLanguage::TYPE_VEC4);
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["NORMAL"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["NORMALMAP"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["NORMALMAP_DEPTH"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["UV"] = ShaderLanguage::TYPE_VEC2;
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["UV"] = constt(ShaderLanguage::TYPE_VEC2);
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["TEXTURE"] = ShaderLanguage::TYPE_SAMPLER2D;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["TEXTURE_PIXEL_SIZE"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["NORMAL_TEXTURE"] = ShaderLanguage::TYPE_SAMPLER2D;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_UV"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_PIXEL_SIZE"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["POINT_COORD"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["AT_LIGHT_PASS"] = ShaderLanguage::TYPE_BOOL;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_TEXTURE"] = ShaderLanguage::TYPE_SAMPLER2D;
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["TEXTURE"] = constt(ShaderLanguage::TYPE_SAMPLER2D);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["TEXTURE_PIXEL_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["NORMAL_TEXTURE"] = constt(ShaderLanguage::TYPE_SAMPLER2D);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_UV"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_PIXEL_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["POINT_COORD"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["AT_LIGHT_PASS"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_TEXTURE"] = constt(ShaderLanguage::TYPE_SAMPLER2D);
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["fragment"].can_discard = true;
 
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["POSITION"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["NORMAL"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["UV"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TEXTURE"] = ShaderLanguage::TYPE_SAMPLER2D;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TEXTURE_PIXEL_SIZE"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["SCREEN_UV"] = ShaderLanguage::TYPE_VEC2;
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["POSITION"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["NORMAL"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["UV"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["COLOR"] = constt(ShaderLanguage::TYPE_VEC4);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TEXTURE"] = constt(ShaderLanguage::TYPE_SAMPLER2D);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TEXTURE_PIXEL_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["SCREEN_UV"] = constt(ShaderLanguage::TYPE_VEC2);
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["LIGHT_VEC"] = ShaderLanguage::TYPE_VEC2;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["LIGHT_HEIGHT"] = ShaderLanguage::TYPE_FLOAT;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["LIGHT_COLOR"] = ShaderLanguage::TYPE_VEC4;
@@ -216,8 +221,8 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["LIGHT_SHADOW"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["LIGHT"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["SHADOW"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["POINT_COORD"] = ShaderLanguage::TYPE_VEC2;
-	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["POINT_COORD"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
 	shader_modes[VS::SHADER_CANVAS_ITEM].functions["light"].can_discard = true;
 
 	shader_modes[VS::SHADER_CANVAS_ITEM].modes.insert("skip_vertex_transform");
@@ -237,16 +242,16 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["VELOCITY"] = ShaderLanguage::TYPE_VEC3;
 	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["MASS"] = ShaderLanguage::TYPE_FLOAT;
 	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["ACTIVE"] = ShaderLanguage::TYPE_BOOL;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["RESTART"] = ShaderLanguage::TYPE_BOOL;
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["RESTART"] = constt(ShaderLanguage::TYPE_BOOL);
 	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["CUSTOM"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["TRANSFORM"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["TIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["LIFETIME"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["DELTA"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["NUMBER"] = ShaderLanguage::TYPE_UINT;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["INDEX"] = ShaderLanguage::TYPE_INT;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["EMISSION_TRANSFORM"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["RANDOM_SEED"] = ShaderLanguage::TYPE_UINT;
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["LIFETIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["DELTA"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["NUMBER"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["INDEX"] = constt(ShaderLanguage::TYPE_INT);
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["EMISSION_TRANSFORM"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].built_ins["RANDOM_SEED"] = constt(ShaderLanguage::TYPE_UINT);
 	shader_modes[VS::SHADER_PARTICLES].functions["vertex"].can_discard = false;
 
 	shader_modes[VS::SHADER_PARTICLES].modes.insert("billboard");
diff --git a/servers/visual/visual_server_raster.h b/servers/visual/visual_server_raster.h
index d843c443a2..a0e79e9d3e 100644
--- a/servers/visual/visual_server_raster.h
+++ b/servers/visual/visual_server_raster.h
@@ -361,6 +361,24 @@ public:
 	BIND2(gi_probe_set_dynamic_data, RID, const PoolVector<int> &)
 	BIND1RC(PoolVector<int>, gi_probe_get_dynamic_data, RID)
 
+	/* LIGHTMAP CAPTURE */
+
+	BIND0R(RID, lightmap_capture_create)
+
+	BIND2(lightmap_capture_set_bounds, RID, const AABB &)
+	BIND1RC(AABB, lightmap_capture_get_bounds, RID)
+
+	BIND2(lightmap_capture_set_octree, RID, const PoolVector<uint8_t> &)
+	BIND1RC(PoolVector<uint8_t>, lightmap_capture_get_octree, RID)
+
+	BIND2(lightmap_capture_set_octree_cell_transform, RID, const Transform &)
+	BIND1RC(Transform, lightmap_capture_get_octree_cell_transform, RID)
+	BIND2(lightmap_capture_set_octree_cell_subdiv, RID, int)
+	BIND1RC(int, lightmap_capture_get_octree_cell_subdiv, RID)
+
+	BIND2(lightmap_capture_set_energy, RID, float)
+	BIND1RC(float, lightmap_capture_get_energy, RID)
+
 	/* PARTICLES */
 
 	BIND0R(RID, particles_create)
@@ -504,6 +522,7 @@ public:
 	BIND3(instance_set_blend_shape_weight, RID, int, float)
 	BIND3(instance_set_surface_material, RID, int, RID)
 	BIND2(instance_set_visible, RID, bool)
+	BIND3(instance_set_use_lightmap, RID, RID, RID)
 
 	BIND2(instance_set_custom_aabb, RID, AABB)
 
diff --git a/servers/visual/visual_server_scene.cpp b/servers/visual/visual_server_scene.cpp
index dde69eedd3..22be2f6ff9 100644
--- a/servers/visual/visual_server_scene.cpp
+++ b/servers/visual/visual_server_scene.cpp
@@ -133,6 +133,19 @@ void *VisualServerScene::_instance_pair(void *p_self, OctreeElementID, Instance
 		geom->reflection_dirty = true;
 
 		return E; //this element should make freeing faster
+	} else if (B->base_type == VS::INSTANCE_LIGHTMAP_CAPTURE && ((1 << A->base_type) & VS::INSTANCE_GEOMETRY_MASK)) {
+
+		InstanceLightmapCaptureData *lightmap_capture = static_cast<InstanceLightmapCaptureData *>(B->base_data);
+		InstanceGeometryData *geom = static_cast<InstanceGeometryData *>(A->base_data);
+
+		InstanceLightmapCaptureData::PairInfo pinfo;
+		pinfo.geometry = A;
+		pinfo.L = geom->lightmap_captures.push_back(B);
+
+		List<InstanceLightmapCaptureData::PairInfo>::Element *E = lightmap_capture->geometries.push_back(pinfo);
+		((VisualServerScene *)p_self)->_instance_queue_update(A, false, false); //need to update capture
+
+		return E; //this element should make freeing faster
 	} else if (B->base_type == VS::INSTANCE_GI_PROBE && ((1 << A->base_type) & VS::INSTANCE_GEOMETRY_MASK)) {
 
 		InstanceGIProbeData *gi_probe = static_cast<InstanceGIProbeData *>(B->base_data);
@@ -193,6 +206,16 @@ void VisualServerScene::_instance_unpair(void *p_self, OctreeElementID, Instance
 		reflection_probe->geometries.erase(E);
 
 		geom->reflection_dirty = true;
+	} else if (B->base_type == VS::INSTANCE_LIGHTMAP_CAPTURE && ((1 << A->base_type) & VS::INSTANCE_GEOMETRY_MASK)) {
+
+		InstanceLightmapCaptureData *lightmap_capture = static_cast<InstanceLightmapCaptureData *>(B->base_data);
+		InstanceGeometryData *geom = static_cast<InstanceGeometryData *>(A->base_data);
+
+		List<InstanceLightmapCaptureData::PairInfo>::Element *E = reinterpret_cast<List<InstanceLightmapCaptureData::PairInfo>::Element *>(udata);
+
+		geom->lightmap_captures.erase(E->get().L);
+		lightmap_capture->geometries.erase(E);
+		((VisualServerScene *)p_self)->_instance_queue_update(A, false, false); //need to update capture
 
 	} else if (B->base_type == VS::INSTANCE_GI_PROBE && ((1 << A->base_type) & VS::INSTANCE_GEOMETRY_MASK)) {
 
@@ -344,6 +367,14 @@ void VisualServerScene::instance_set_base(RID p_instance, RID p_base) {
 					reflection_probe_render_list.remove(&reflection_probe->update_list);
 				}
 			} break;
+			case VS::INSTANCE_LIGHTMAP_CAPTURE: {
+
+				InstanceLightmapCaptureData *lightmap_capture = static_cast<InstanceLightmapCaptureData *>(instance->base_data);
+				//erase dependencies, since no longer a lightmap
+				while (lightmap_capture->users.front()) {
+					instance_set_use_lightmap(lightmap_capture->users.front()->get()->self, RID(), RID());
+				}
+			} break;
 			case VS::INSTANCE_GI_PROBE: {
 
 				InstanceGIProbeData *gi_probe = static_cast<InstanceGIProbeData *>(instance->base_data);
@@ -355,6 +386,14 @@ void VisualServerScene::instance_set_base(RID p_instance, RID p_base) {
 					VSG::storage->free(gi_probe->dynamic.probe_data);
 				}
 
+				if (instance->lightmap_capture) {
+					Instance *capture = (Instance *)instance->lightmap_capture;
+					InstanceLightmapCaptureData *lightmap_capture = static_cast<InstanceLightmapCaptureData *>(capture->base_data);
+					lightmap_capture->users.erase(instance);
+					instance->lightmap_capture = NULL;
+					instance->lightmap = RID();
+				}
+
 				VSG::scene_render->free(gi_probe->probe_instance);
 
 			} break;
@@ -412,6 +451,12 @@ void VisualServerScene::instance_set_base(RID p_instance, RID p_base) {
 
 				reflection_probe->instance = VSG::scene_render->reflection_probe_instance_create(p_base);
 			} break;
+			case VS::INSTANCE_LIGHTMAP_CAPTURE: {
+
+				InstanceLightmapCaptureData *lightmap_capture = memnew(InstanceLightmapCaptureData);
+				instance->base_data = lightmap_capture;
+				//lightmap_capture->instance = VSG::scene_render->lightmap_capture_instance_create(p_base);
+			} break;
 			case VS::INSTANCE_GI_PROBE: {
 
 				InstanceGIProbeData *gi_probe = memnew(InstanceGIProbeData);
@@ -591,6 +636,12 @@ void VisualServerScene::instance_set_visible(RID p_instance, bool p_visible) {
 			}
 
 		} break;
+		case VS::INSTANCE_LIGHTMAP_CAPTURE: {
+			if (instance->octree_id && instance->scenario) {
+				instance->scenario->octree.set_pairable(instance->octree_id, p_visible, 1 << VS::INSTANCE_LIGHTMAP_CAPTURE, p_visible ? VS::INSTANCE_GEOMETRY_MASK : 0);
+			}
+
+		} break;
 		case VS::INSTANCE_GI_PROBE: {
 			if (instance->octree_id && instance->scenario) {
 				instance->scenario->octree.set_pairable(instance->octree_id, p_visible, 1 << VS::INSTANCE_GI_PROBE, p_visible ? (VS::INSTANCE_GEOMETRY_MASK | (1 << VS::INSTANCE_LIGHT)) : 0);
@@ -599,11 +650,35 @@ void VisualServerScene::instance_set_visible(RID p_instance, bool p_visible) {
 		} break;
 	}
 }
-
 inline bool is_geometry_instance(VisualServer::InstanceType p_type) {
 	return p_type == VS::INSTANCE_MESH || p_type == VS::INSTANCE_MULTIMESH || p_type == VS::INSTANCE_PARTICLES || p_type == VS::INSTANCE_IMMEDIATE;
 }
 
+void VisualServerScene::instance_set_use_lightmap(RID p_instance, RID p_lightmap_instance, RID p_lightmap) {
+	// Binds geometry instance p_instance to the lightmap capture instance p_lightmap_instance and stores p_lightmap on it; an invalid p_lightmap_instance only removes the current binding.
+	Instance *instance = instance_owner.get(p_instance);
+	ERR_FAIL_COND(!instance);
+	ERR_FAIL_COND(!is_geometry_instance(instance->base_type));
+
+	if (instance->lightmap_capture) {
+		//leave the user set of the capture this instance was bound to so far
+		Instance *previous = (Instance *)instance->lightmap_capture;
+		static_cast<InstanceLightmapCaptureData *>(previous->base_data)->users.erase(instance);
+		instance->lightmap = RID();
+		instance->lightmap_capture = NULL;
+	}
+
+	if (!p_lightmap_instance.is_valid()) {
+		return;
+	}
+	Instance *lightmap_instance = instance_owner.get(p_lightmap_instance);
+	ERR_FAIL_COND(!lightmap_instance);
+	ERR_FAIL_COND(lightmap_instance->base_type != VS::INSTANCE_LIGHTMAP_CAPTURE);
+	instance->lightmap_capture = lightmap_instance;
+	static_cast<InstanceLightmapCaptureData *>(lightmap_instance->base_data)->users.insert(instance);
+	instance->lightmap = p_lightmap;
+}
+
 void VisualServerScene::instance_set_custom_aabb(RID p_instance, AABB p_aabb) {
 
 	Instance *instance = instance_owner.get(p_instance);
@@ -811,6 +886,15 @@ void VisualServerScene::_update_instance(Instance *p_instance) {
 				light->shadow_dirty = true;
 			}
 		}
+
+		if (!p_instance->lightmap_capture && geom->lightmap_captures.size()) {
+			//not bound via instance_set_use_lightmap but paired with lightmap captures, must update capture info!
+			_update_instance_lightmap_captures(p_instance);
+		} else {
+			if (!p_instance->lightmap_capture_data.empty()) {
+				p_instance->lightmap_capture_data.resize(0); //not in use, clear capture data
+			}
+		}
 	}
 
 	p_instance->mirror = p_instance->transform.basis.determinant() < 0.0;
@@ -832,7 +916,7 @@ void VisualServerScene::_update_instance(Instance *p_instance) {
 		uint32_t pairable_mask = 0;
 		bool pairable = false;
 
-		if (p_instance->base_type == VS::INSTANCE_LIGHT || p_instance->base_type == VS::INSTANCE_REFLECTION_PROBE) {
+		if (p_instance->base_type == VS::INSTANCE_LIGHT || p_instance->base_type == VS::INSTANCE_REFLECTION_PROBE || p_instance->base_type == VS::INSTANCE_LIGHTMAP_CAPTURE) {
 
 			pairable_mask = p_instance->visible ? VS::INSTANCE_GEOMETRY_MASK : 0;
 			pairable = true;
@@ -917,6 +1001,11 @@ void VisualServerScene::_update_instance_aabb(Instance *p_instance) {
 			new_aabb = VSG::storage->gi_probe_get_bounds(p_instance->base);
 
 		} break;
+		case VisualServer::INSTANCE_LIGHTMAP_CAPTURE: {
+
+			new_aabb = VSG::storage->lightmap_capture_get_bounds(p_instance->base);
+
+		} break;
 
 		default: {}
 	}
@@ -928,6 +1017,237 @@ void VisualServerScene::_update_instance_aabb(Instance *p_instance) {
 	p_instance->aabb = new_aabb;
 }
 
+_FORCE_INLINE_ static void _light_capture_sample_octree(const RasterizerStorage::LightmapCaptureOctree *p_octree, int p_cell_subdiv, const Vector3 &p_pos, const Vector3 &p_dir, float p_level, Vector3 &r_color, float &r_alpha) {
+	// Writes a filtered color (r_color) and alpha (r_alpha) read from p_octree around p_pos: 8 neighbouring cells are blended on each of two adjacent octree levels, then the two levels are blended. A larger p_level means fewer octree descents.
+	static const Vector3 aniso_normal[6] = {
+		Vector3(-1, 0, 0),
+		Vector3(1, 0, 0),
+		Vector3(0, -1, 0),
+		Vector3(0, 1, 0),
+		Vector3(0, 0, -1),
+		Vector3(0, 0, 1)
+	};
+
+	int size = 1 << (p_cell_subdiv - 1); //positions are clamped to [0, size - 1] on every axis
+
+	int clamp_v = size - 1;
+	//first of all, clamp
+	Vector3 pos;
+	pos.x = CLAMP(p_pos.x, 0, clamp_v);
+	pos.y = CLAMP(p_pos.y, 0, clamp_v);
+	pos.z = CLAMP(p_pos.z, 0, clamp_v);
+
+	float level = (p_cell_subdiv - 1) - p_level; //requested number of descents, may be fractional
+
+	int target_level;
+	float level_filter;
+	if (level <= 0.0) {
+		level_filter = 0;
+		target_level = 0;
+	} else {
+		target_level = Math::ceil(level);
+		level_filter = target_level - level; //blend weight towards the next shallower level (index 1 below)
+	}
+
+	Vector3 color[2][8]; //[level][corner]; only ever accumulated with += below, so this assumes Vector3() starts at zero - TODO confirm
+	float alpha[2][8];
+	zeromem(alpha, sizeof(float) * 2 * 8);
+
+	//find cell at given level first
+
+	for (int c = 0; c < 2; c++) { //c == 0 samples at target_level, c == 1 one level shallower (not below 0)
+
+		int current_level = MAX(0, target_level - c);
+		int level_cell_size = (1 << (p_cell_subdiv - 1)) >> current_level;
+
+		for (int n = 0; n < 8; n++) { //bits 0, 1 and 2 of n move the sample one cell along +x, +y and +z
+
+			int x = int(pos.x);
+			int y = int(pos.y);
+			int z = int(pos.z);
+
+			if (n & 1)
+				x += level_cell_size;
+			if (n & 2)
+				y += level_cell_size;
+			if (n & 4)
+				z += level_cell_size;
+
+			int ofs_x = 0;
+			int ofs_y = 0;
+			int ofs_z = 0;
+
+			x = CLAMP(x, 0, clamp_v);
+			y = CLAMP(y, 0, clamp_v);
+			z = CLAMP(z, 0, clamp_v);
+
+			int half = size / 2;
+			uint32_t cell = 0; //the descent starts at octree entry 0
+			for (int i = 0; i < current_level; i++) {
+
+				const RasterizerStorage::LightmapCaptureOctree *bc = &p_octree[cell];
+				//pick the child octant containing (x, y, z): one bit per axis, compared against the midpoint of the current cell
+				int child = 0;
+				if (x >= ofs_x + half) {
+					child |= 1;
+					ofs_x += half;
+				}
+				if (y >= ofs_y + half) {
+					child |= 2;
+					ofs_y += half;
+				}
+				if (z >= ofs_z + half) {
+					child |= 4;
+					ofs_z += half;
+				}
+
+				cell = bc->children[child];
+				if (cell == RasterizerStorage::LightmapCaptureOctree::CHILD_EMPTY)
+					break;
+
+				half >>= 1;
+			}
+
+			if (cell == RasterizerStorage::LightmapCaptureOctree::CHILD_EMPTY) {
+				alpha[c][n] = 0;
+			} else {
+				alpha[c][n] = p_octree[cell].alpha;
+
+				for (int i = 0; i < 6; i++) {
+					//anisotropic read light
+					float amount = p_dir.dot(aniso_normal[i]); //weight of this axis direction, negative values contribute nothing
+					if (amount < 0)
+						amount = 0;
+					color[c][n].x += p_octree[cell].light[i][0] / 1024.0 * amount; //stored light is divided by 1024.0, presumably a fixed point scale - TODO confirm
+					color[c][n].y += p_octree[cell].light[i][1] / 1024.0 * amount;
+					color[c][n].z += p_octree[cell].light[i][2] / 1024.0 * amount;
+				}
+			}
+
+			//print_line("\tlev " + itos(c) + " - " + itos(n) + " alpha: " + rtos(cells[test_cell].alpha) + " col: " + color[c][n]);
+		}
+	}
+
+	float target_level_size = size >> target_level;
+	Vector3 pos_fract[2];
+	//fractional position inside a cell of each sampled level, used as interpolation weights below
+	pos_fract[0].x = Math::fmod(pos.x, target_level_size) / target_level_size;
+	pos_fract[0].y = Math::fmod(pos.y, target_level_size) / target_level_size;
+	pos_fract[0].z = Math::fmod(pos.z, target_level_size) / target_level_size;
+
+	target_level_size = size >> MAX(0, target_level - 1);
+
+	pos_fract[1].x = Math::fmod(pos.x, target_level_size) / target_level_size;
+	pos_fract[1].y = Math::fmod(pos.y, target_level_size) / target_level_size;
+	pos_fract[1].z = Math::fmod(pos.z, target_level_size) / target_level_size;
+
+	float alpha_interp[2];
+	Vector3 color_interp[2];
+
+	for (int i = 0; i < 2; i++) { //per level: interpolate the 8 corners along x, then y, then z
+
+		Vector3 color_x00 = color[i][0].linear_interpolate(color[i][1], pos_fract[i].x);
+		Vector3 color_xy0 = color[i][2].linear_interpolate(color[i][3], pos_fract[i].x);
+		Vector3 blend_z0 = color_x00.linear_interpolate(color_xy0, pos_fract[i].y);
+
+		Vector3 color_x0z = color[i][4].linear_interpolate(color[i][5], pos_fract[i].x);
+		Vector3 color_xyz = color[i][6].linear_interpolate(color[i][7], pos_fract[i].x);
+		Vector3 blend_z1 = color_x0z.linear_interpolate(color_xyz, pos_fract[i].y);
+
+		color_interp[i] = blend_z0.linear_interpolate(blend_z1, pos_fract[i].z);
+
+		float alpha_x00 = Math::lerp(alpha[i][0], alpha[i][1], pos_fract[i].x);
+		float alpha_xy0 = Math::lerp(alpha[i][2], alpha[i][3], pos_fract[i].x);
+		float alpha_z0 = Math::lerp(alpha_x00, alpha_xy0, pos_fract[i].y);
+
+		float alpha_x0z = Math::lerp(alpha[i][4], alpha[i][5], pos_fract[i].x);
+		float alpha_xyz = Math::lerp(alpha[i][6], alpha[i][7], pos_fract[i].x);
+		float alpha_z1 = Math::lerp(alpha_x0z, alpha_xyz, pos_fract[i].y);
+
+		alpha_interp[i] = Math::lerp(alpha_z0, alpha_z1, pos_fract[i].z);
+	}
+	//finally blend the two levels with the fractional part computed at the top
+	r_color = color_interp[0].linear_interpolate(color_interp[1], level_filter);
+	r_alpha = Math::lerp(alpha_interp[0], alpha_interp[1], level_filter);
+
+	//	print_line("pos: " + p_posf + " level " + rtos(p_level) + " down to " + itos(target_level) + "." + rtos(level_filter) + " color " + r_color + " alpha " + rtos(r_alpha));
+}
+
+_FORCE_INLINE_ static Color _light_capture_voxel_cone_trace(const RasterizerStorage::LightmapCaptureOctree *p_octree, const Vector3 &p_pos, const Vector3 &p_dir, float p_aperture, int p_cell_subdiv) {
+	// Steps from p_pos along p_dir through the octree, weighting every sample by the alpha not yet accumulated.
+	// Stops once the accumulated alpha reaches 0.95 or the travelled distance reaches the diagonal of the cell grid.
+	const float march_limit = (Vector3(1, 1, 1) * (1 << (p_cell_subdiv - 1))).length();
+
+	Vector3 gathered_color;
+	float gathered_alpha = 0.0;
+
+	Vector3 sample_color;
+	float sample_alpha;
+
+	float travelled = 0.0; //starts right at p_pos, no need for bias here
+	while (travelled < march_limit && gathered_alpha < 0.95) {
+		//cone diameter at this distance, never below 1; its log2 is the level handed to the sampler
+		const float diameter = MAX(1.0, 2.0 * p_aperture * travelled);
+		_light_capture_sample_octree(p_octree, p_cell_subdiv, p_pos + travelled * p_dir, p_dir, log2(diameter), sample_color, sample_alpha);
+		const float remaining = (1.0 - gathered_alpha);
+		gathered_color += sample_color * remaining;
+		gathered_alpha += remaining * sample_alpha;
+		travelled += diameter * 0.5;
+	}
+	return Color(gathered_color.x, gathered_color.y, gathered_color.z, gathered_alpha);
+}
+
+void VisualServerScene::_update_instance_lightmap_captures(Instance *p_instance) {
+	// Rebuilds p_instance->lightmap_capture_data: 12 colors, one per cone direction listed below,
+	// summed over every lightmap capture found in the geometry data's lightmap_captures list.
+
+	static const Vector3 cone_traces[12] = {
+		Vector3(0, 0, 1),
+		Vector3(0.866025, 0, 0.5),
+		Vector3(0.267617, 0.823639, 0.5),
+		Vector3(-0.700629, 0.509037, 0.5),
+		Vector3(-0.700629, -0.509037, 0.5),
+		Vector3(0.267617, -0.823639, 0.5),
+		Vector3(0, 0, -1),
+		Vector3(0.866025, 0, -0.5),
+		Vector3(0.267617, 0.823639, -0.5),
+		Vector3(-0.700629, 0.509037, -0.5),
+		Vector3(-0.700629, -0.509037, -0.5),
+		Vector3(0.267617, -0.823639, -0.5)
+	};
+
+	const float cone_aperture = 0.577; // ~tan(30 degrees), i.e. a cone spanning 60 degrees in total
+	InstanceGeometryData *geom = static_cast<InstanceGeometryData *>(p_instance->base_data);
+
+	if (p_instance->lightmap_capture_data.empty()) {
+		p_instance->lightmap_capture_data.resize(12);
+	}
+	//start from zero, every capture adds on top (this could use some sort of blending..)
+	zeromem(p_instance->lightmap_capture_data.ptrw(), 12 * sizeof(Color));
+
+	for (List<Instance *>::Element *E = geom->lightmap_captures.front(); E; E = E->next()) {
+		Instance *capture_instance = E->get();
+		const PoolVector<RasterizerStorage::LightmapCaptureOctree> *octree = VSG::storage->lightmap_capture_get_octree_ptr(capture_instance->base);
+		if (octree->size() == 0) {
+			continue; //empty octree, nothing to trace
+		}
+
+		//undo the capture instance's own transform, then apply the cell transform kept in storage
+		Transform to_cell = VSG::storage->lightmap_capture_get_octree_cell_transform(capture_instance->base);
+		const int subdiv = VSG::storage->lightmap_capture_get_octree_cell_subdiv(capture_instance->base);
+		to_cell = to_cell * capture_instance->transform.affine_inverse();
+
+		PoolVector<RasterizerStorage::LightmapCaptureOctree>::Read cells = octree->read();
+		const Vector3 origin_in_cells = to_cell.xform(p_instance->transform.origin);
+
+		for (int trace = 0; trace < 12; trace++) {
+			const Vector3 trace_dir = to_cell.basis.xform(cone_traces[trace]).normalized();
+			const Color traced = _light_capture_voxel_cone_trace(cells.ptr(), origin_in_cells, trace_dir, cone_aperture, subdiv);
+			p_instance->lightmap_capture_data[trace] += traced;
+		}
+	}
+}
+
 void VisualServerScene::_light_instance_update_shadow(Instance *p_instance, const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, RID p_shadow_atlas, Scenario *p_scenario) {
 
 	InstanceLightData *light = static_cast<InstanceLightData *>(p_instance->base_data);
@@ -2188,6 +2508,8 @@ void VisualServerScene::_bake_gi_probe_light(const GIProbeDataHeader *header, co
 				InstanceGIProbeData::LocalData *light = &local_data[idx];
 
 				Vector3 to(light->pos[0] + 0.5, light->pos[1] + 0.5, light->pos[2] + 0.5);
+				to += -light_axis.sign() * 0.47; //make it more likely to receive a ray
+
 				Vector3 norm(
 						(((cells[idx].normal >> 16) & 0xFF) / 255.0) * 2.0 - 1.0,
 						(((cells[idx].normal >> 8) & 0xFF) / 255.0) * 2.0 - 1.0,
@@ -2254,6 +2576,8 @@ void VisualServerScene::_bake_gi_probe_light(const GIProbeDataHeader *header, co
 				InstanceGIProbeData::LocalData *light = &local_data[idx];
 
 				Vector3 to(light->pos[0] + 0.5, light->pos[1] + 0.5, light->pos[2] + 0.5);
+				to += (light_pos - to).sign() * 0.47; //make it more likely to receive a ray
+
 				Vector3 norm(
 						(((cells[idx].normal >> 16) & 0xFF) / 255.0) * 2.0 - 1.0,
 						(((cells[idx].normal >> 8) & 0xFF) / 255.0) * 2.0 - 1.0,
@@ -2927,12 +3251,12 @@ void VisualServerScene::_update_dirty_instance(Instance *p_instance) {
 		}
 	}
 
+	_instance_update_list.remove(&p_instance->update_item);
+
 	_update_instance(p_instance);
 
 	p_instance->update_aabb = false;
 	p_instance->update_materials = false;
-
-	_instance_update_list.remove(&p_instance->update_item);
 }
 
 void VisualServerScene::update_dirty_instances() {
diff --git a/servers/visual/visual_server_scene.h b/servers/visual/visual_server_scene.h
index 9e4701de65..4b0c4af09d 100644
--- a/servers/visual/visual_server_scene.h
+++ b/servers/visual/visual_server_scene.h
@@ -281,6 +281,8 @@ public:
 		List<Instance *> gi_probes;
 		bool gi_probes_dirty;
 
+		List<Instance *> lightmap_captures;
+
 		InstanceGeometryData() {
 
 			lighting_dirty = false;
@@ -445,6 +447,20 @@ public:
 
 	SelfList<InstanceGIProbeData>::List gi_probe_update_list;
 
+	struct InstanceLightmapCaptureData : public InstanceBaseData {
+		// Per-instance data of an INSTANCE_LIGHTMAP_CAPTURE instance: the geometry paired with it and the instances bound to it as lightmap.
+		struct PairInfo {
+			List<Instance *>::Element *L; //iterator in geometry (its entry in the geometry's lightmap_captures list)
+			Instance *geometry; //the paired geometry instance
+		};
+		List<PairInfo> geometries; //one entry per pairing, added in _instance_pair and erased in _instance_unpair
+
+		Set<Instance *> users; //instances bound through instance_set_use_lightmap
+
+		InstanceLightmapCaptureData() {
+		}
+	};
+
 	Instance *instance_cull_result[MAX_INSTANCE_CULL];
 	Instance *instance_shadow_cull_result[MAX_INSTANCE_CULL]; //used for generating shadowmaps
 	Instance *light_cull_result[MAX_LIGHTS_CULLED];
@@ -466,6 +482,7 @@ public:
 	virtual void instance_set_blend_shape_weight(RID p_instance, int p_shape, float p_weight);
 	virtual void instance_set_surface_material(RID p_instance, int p_surface, RID p_material);
 	virtual void instance_set_visible(RID p_instance, bool p_visible);
+	virtual void instance_set_use_lightmap(RID p_instance, RID p_lightmap_instance, RID p_lightmap);
 
 	virtual void instance_set_custom_aabb(RID p_insatnce, AABB aabb);
 
@@ -489,6 +506,7 @@ public:
 	_FORCE_INLINE_ void _update_instance(Instance *p_instance);
 	_FORCE_INLINE_ void _update_instance_aabb(Instance *p_instance);
 	_FORCE_INLINE_ void _update_dirty_instance(Instance *p_instance);
+	_FORCE_INLINE_ void _update_instance_lightmap_captures(Instance *p_instance);
 
 	_FORCE_INLINE_ void _light_instance_update_shadow(Instance *p_instance, const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, RID p_shadow_atlas, Scenario *p_scenario);
 
diff --git a/servers/visual/visual_server_wrap_mt.h b/servers/visual/visual_server_wrap_mt.h
index 94f450c024..cb6f67474e 100644
--- a/servers/visual/visual_server_wrap_mt.h
+++ b/servers/visual/visual_server_wrap_mt.h
@@ -294,6 +294,22 @@ public:
 	FUNC2(gi_probe_set_dynamic_data, RID, const PoolVector<int> &)
 	FUNC1RC(PoolVector<int>, gi_probe_get_dynamic_data, RID)
 
+	/* LIGHTMAP CAPTURE */
+
+	FUNCRID(lightmap_capture)
+
+	FUNC2(lightmap_capture_set_bounds, RID, const AABB &)
+	FUNC1RC(AABB, lightmap_capture_get_bounds, RID)
+
+	FUNC2(lightmap_capture_set_octree, RID, const PoolVector<uint8_t> &)
+	FUNC1RC(PoolVector<uint8_t>, lightmap_capture_get_octree, RID)
+	FUNC2(lightmap_capture_set_octree_cell_transform, RID, const Transform &)
+	FUNC1RC(Transform, lightmap_capture_get_octree_cell_transform, RID)
+	FUNC2(lightmap_capture_set_octree_cell_subdiv, RID, int)
+	FUNC1RC(int, lightmap_capture_get_octree_cell_subdiv, RID)
+	FUNC2(lightmap_capture_set_energy, RID, float)
+	FUNC1RC(float, lightmap_capture_get_energy, RID)
+
 	/* PARTICLES */
 
 	FUNCRID(particles)
@@ -425,6 +441,8 @@ public:
 	FUNC3(instance_set_blend_shape_weight, RID, int, float)
 	FUNC3(instance_set_surface_material, RID, int, RID)
 	FUNC2(instance_set_visible, RID, bool)
+	FUNC3(instance_set_use_lightmap, RID, RID, RID)
+
 	FUNC2(instance_set_custom_aabb, RID, AABB)
 
 	FUNC2(instance_attach_skeleton, RID, RID)
diff --git a/servers/visual_server.h b/servers/visual_server.h
index de5ef7da0a..ad4d32b967 100644
--- a/servers/visual_server.h
+++ b/servers/visual_server.h
@@ -485,6 +485,20 @@ public:
 	virtual void gi_probe_set_compress(RID p_probe, bool p_enable) = 0;
 	virtual bool gi_probe_is_compressed(RID p_probe) const = 0;
 
+	/* LIGHTMAP CAPTURE */
+
+	virtual RID lightmap_capture_create() = 0;
+	virtual void lightmap_capture_set_bounds(RID p_capture, const AABB &p_bounds) = 0;
+	virtual AABB lightmap_capture_get_bounds(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_octree(RID p_capture, const PoolVector<uint8_t> &p_octree) = 0;
+	virtual void lightmap_capture_set_octree_cell_transform(RID p_capture, const Transform &p_xform) = 0;
+	virtual Transform lightmap_capture_get_octree_cell_transform(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_octree_cell_subdiv(RID p_capture, int p_subdiv) = 0;
+	virtual int lightmap_capture_get_octree_cell_subdiv(RID p_capture) const = 0;
+	virtual PoolVector<uint8_t> lightmap_capture_get_octree(RID p_capture) const = 0;
+	virtual void lightmap_capture_set_energy(RID p_capture, float p_energy) = 0;
+	virtual float lightmap_capture_get_energy(RID p_capture) const = 0;
+
 	/* PARTICLES API */
 
 	virtual RID particles_create() = 0;
@@ -735,6 +749,7 @@ public:
 		INSTANCE_LIGHT,
 		INSTANCE_REFLECTION_PROBE,
 		INSTANCE_GI_PROBE,
+		INSTANCE_LIGHTMAP_CAPTURE,
 		INSTANCE_MAX,
 		/*INSTANCE_BAKED_LIGHT_SAMPLER,*/
 
@@ -755,6 +770,8 @@ public:
 	virtual void instance_set_surface_material(RID p_instance, int p_surface, RID p_material) = 0;
 	virtual void instance_set_visible(RID p_instance, bool p_visible) = 0;
 
+	virtual void instance_set_use_lightmap(RID p_instance, RID p_lightmap_instance, RID p_lightmap) = 0;
+
 	virtual void instance_set_custom_aabb(RID p_instance, AABB aabb) = 0;
 
 	virtual void instance_attach_skeleton(RID p_instance, RID p_skeleton) = 0;
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 7d2009cdd9..8c50081782 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -62,8 +62,8 @@ Use UI font if exists, because it has tight vertical metrics and good for UI.
 ### Hack Regular
 
 - Upstream: https://github.com/source-foundry/Hack
-- Version: 2.020
-- License: Hack Open Font License v2.0
+- Version: 3.000
+- License: MIT + Bitstream Vera License
 
 ### DroidSans*.ttf
 
@@ -179,11 +179,14 @@ Files extracted from upstream source:
 
 TODO.
 
+Important: File `libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c` has a
+Godot-made change marked with `// -- GODOT --` comments.
+
 
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 0.6.0
+- Version: 0.6.1
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -269,7 +272,7 @@ Collection of single-file libraries used in Godot components.
 ### poshlib
 
 - Upstream: http://poshlib.hookatooka.com/poshlib/trac.cgi (username guest, password guest123)
-- Version: 1.3.002 
+- Version: 1.3.002
 - License: MIT
 
 Files extracted from the upstream source:
@@ -305,6 +308,10 @@ Files extracted from the upstream source:
 - Relevant sources from src/
 - License.txt
 
+Important: Some files have Godot-made changes; those
+changes are marked with `// -- GODOT --` comments.
+
+
 ## nanosvg
 
 - Upstream: https://github.com/memononen/nanosvg
diff --git a/thirdparty/fonts/Hack_Regular.ttf b/thirdparty/fonts/Hack_Regular.ttf
index a35ea2e4f4..f342700811 100644
--- a/thirdparty/fonts/Hack_Regular.ttf
+++ b/thirdparty/fonts/Hack_Regular.ttf
diff --git a/thirdparty/fonts/LICENSE_Hack.md b/thirdparty/fonts/LICENSE_Hack.md
index e9fc8a1f87..ddd23a2b81 100644
--- a/thirdparty/fonts/LICENSE_Hack.md
+++ b/thirdparty/fonts/LICENSE_Hack.md
@@ -1,49 +1,30 @@
-## License
+The work in the Hack project is Copyright 2017 Source Foundry Authors and licensed under the MIT License
 
-Hack Copyright 2015, Christopher Simpkins with Reserved Font Name "Hack".
+The work in the DejaVu project was committed to the public domain.
 
 Bitstream Vera Sans Mono Copyright 2003 Bitstream Inc. and licensed under the Bitstream Vera License with Reserved Font Names "Bitstream" and "Vera"
 
-DejaVu modifications of the original Bitstream Vera Sans Mono typeface have been committed to the public domain.
+### MIT License
 
+Copyright (c) 2017 Source Foundry Authors
 
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-This Font Software is licensed under the Hack Open Font License v2.0 and the Bitstream Vera License.
-
-These licenses are copied below.
-
-
-### Hack Open Font License v2.0
-
-(Version 1.0 - 06 September 2015)
-
-(Version 2.0 - 27 September 2015)
-
-Copyright 2015 by Christopher Simpkins. All Rights Reserved.
-
-DEFINITIONS
-
-"Author" refers to any designer, engineer, programmer, technical writer or other person who contributed to the Font Software.
-
-PERMISSION AND CONDITIONS
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of the fonts accompanying this license ("Fonts") and associated source code, documentation, and binary files (the "Font Software"), to reproduce and distribute the modifications to the Bitstream Vera Font Software, including without limitation the rights to use, study, copy, merge, embed, modify, redistribute, and/or sell modified or unmodified copies of the Font Software, and to permit persons to whom the Font Software is furnished to do so, subject to the following conditions:
-
-(1) The above copyright notice and this permission notice shall be included in all modified and unmodified copies of the Font Software typefaces. These notices can be included either as stand-alone text files, human-readable headers or in the appropriate machine-readable metadata fields within text or binary files as long as those fields can be easily viewed by the user.
-
-(2) The Font Software may be modified, altered, or added to, and in particular the designs of glyphs or characters in the Fonts may be modified and additional glyphs or characters may be added to the Fonts, only if the fonts are renamed to names not containing the word "Hack".
-
-(3) Neither the Font Software nor any of its individual components, in original or modified versions, may be sold by itself.
-
-TERMINATION
-
-This license becomes null and void if any of the above conditions are not met.
-
-THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE.
-
-Except as contained in this notice, the names of Christopher Simpkins and the Author(s) of the Font Software shall not be used to promote, endorse or advertise any modified version, except to acknowledge the contribution(s) of Christopher Simpkins and the Author(s) or with their explicit written permission.  For further information, contact: chris at sourcefoundry dot org.
-
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
 
 ### BITSTREAM VERA LICENSE
 
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index b718678537..d8a92354c9 100644
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -40,11 +40,12 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
 };
 
 #if defined(__clang__)
+// -- GODOT start --
 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
-    (defined(__APPLE__) && \
+    (!defined(__MACPORTS__) && defined(__APPLE__) && \
         ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
             (__clang_major__ == 5 && __clang_minor__ == 0)))
-
+// -- GODOT end --
 #  define MM256_BROADCASTSI128_SI256(x) \
        _mm_broadcastsi128_si256((__m128i const *)&(x))
 # else  // clang > 3.3, and not 5.0 on macosx.
diff --git a/thirdparty/libwebp/dsp/argb.c b/thirdparty/libwebp/dsp/argb.c
deleted file mode 100644
index cc1f9a96c3..0000000000
--- a/thirdparty/libwebp/dsp/argb.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions.
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-  }
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out) {
-  int i, offset = 0;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
-    offset += step;
-  }
-}
-
-void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
-                    const uint8_t*, int, uint32_t*);
-void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
-                   int, int, uint32_t*);
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-extern void VP8EncDspARGBInitSSE2(void);
-
-static volatile VP8CPUInfo argb_last_cpuinfo_used =
-    (VP8CPUInfo)&argb_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
-  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8PackARGB = PackARGB;
-  VP8PackRGB = PackRGB;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8EncDspARGBInitSSE2();
-    }
-#endif
-#if defined(WEBP_USE_MIPS_DSP_R2)
-    if (VP8GetCPUInfo(kMIPSdspR2)) {
-      VP8EncDspARGBInitMIPSdspR2();
-    }
-#endif
-  }
-  argb_last_cpuinfo_used = VP8GetCPUInfo;
-}
diff --git a/thirdparty/libwebp/dsp/argb_mips_dsp_r2.c b/thirdparty/libwebp/dsp/argb_mips_dsp_r2.c
deleted file mode 100644
index af65acb8ff..0000000000
--- a/thirdparty/libwebp/dsp/argb_mips_dsp_r2.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions (mips version).
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_MIPS_DSP_R2)
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  int temp0, temp1, temp2, temp3, offset;
-  const int rest = len & 1;
-  const uint32_t* const loop_end = out + len - rest;
-  const int step = 4;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out) {
-  int temp0, temp1, temp2, offset;
-  const int rest = len & 1;
-  const int a = 0xff;
-  const uint32_t* const loop_end = out + len - rest;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
-  VP8PackARGB = PackARGB;
-  VP8PackRGB = PackRGB;
-}
-
-#else  // !WEBP_USE_MIPS_DSP_R2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
-
-#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/argb_sse2.c b/thirdparty/libwebp/dsp/argb_sse2.c
deleted file mode 100644
index afcb1957e7..0000000000
--- a/thirdparty/libwebp/dsp/argb_sse2.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions (SSE2 version).
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_SSE2)
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <string.h>
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
-    int i = 0;
-    const int len_max = len & ~3;  // max length processed in main loop
-    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
-    assert(b == r + 2);
-    assert(a == r + 3);
-    for (; i < len_max; i += 4) {
-      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
-      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
-      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
-      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
-      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
-      const __m128i F = _mm_or_si128(E, C);
-      _mm_storeu_si128((__m128i*)(out + i), F);
-    }
-    for (; i < len; ++i) {
-      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-    }
-  } else {
-    assert(g == b + 1);
-    assert(r == b + 2);
-    assert(a == b + 3);
-    memcpy(out, b, len * 4);
-  }
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
-  VP8PackARGB = PackARGB;
-}
-
-#else  // !WEBP_USE_SSE2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
-
-#endif  // WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/enc/backward_references_enc.c b/thirdparty/libwebp/enc/backward_references_enc.c
deleted file mode 100644
index 7c0559ff1e..0000000000
--- a/thirdparty/libwebp/enc/backward_references_enc.c
+++ /dev/null
@@ -1,1800 +0,0 @@
-// Copyright 2012 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Jyrki Alakuijala (jyrki@google.com)
-//
-
-#include <assert.h>
-#include <math.h>
-
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../dsp/dsp.h"
-#include "../utils/color_cache_utils.h"
-#include "../utils/utils.h"
-
-#define VALUES_IN_BYTE 256
-
-#define MIN_BLOCK_SIZE 256  // minimum block size for backward references
-
-#define MAX_ENTROPY    (1e30f)
-
-// 1M window (4M bytes) minus 120 special codes for short distances.
-#define WINDOW_SIZE_BITS 20
-#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
-
-// Minimum number of pixels for which it is cheaper to encode a
-// distance + length instead of each pixel as a literal.
-#define MIN_LENGTH 4
-// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
-// is used in VP8LHashChain.
-#define MAX_LENGTH_BITS 12
-// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
-#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
-#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
-#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
-#endif
-
-// -----------------------------------------------------------------------------
-
-static const uint8_t plane_to_code_lut[128] = {
- 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
- 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
- 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
- 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
- 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
- 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
- 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
- 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
-};
-
-static int DistanceToPlaneCode(int xsize, int dist) {
-  const int yoffset = dist / xsize;
-  const int xoffset = dist - yoffset * xsize;
-  if (xoffset <= 8 && yoffset < 8) {
-    return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
-  } else if (xoffset > xsize - 8 && yoffset < 7) {
-    return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
-  }
-  return dist + 120;
-}
-
-// Returns the exact index where array1 and array2 are different. For an index
-// inferior or equal to best_len_match, the return value just has to be strictly
-// inferior to best_len_match. The current behavior is to return 0 if this index
-// is best_len_match, and the index itself otherwise.
-// If no two elements are the same, it returns max_limit.
-static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
-                                       const uint32_t* const array2,
-                                       int best_len_match, int max_limit) {
-  // Before 'expensive' linear match, check if the two arrays match at the
-  // current best length index.
-  if (array1[best_len_match] != array2[best_len_match]) return 0;
-
-  return VP8LVectorMismatch(array1, array2, max_limit);
-}
-
-// -----------------------------------------------------------------------------
-//  VP8LBackwardRefs
-
-struct PixOrCopyBlock {
-  PixOrCopyBlock* next_;   // next block (or NULL)
-  PixOrCopy* start_;       // data start
-  int size_;               // currently used size
-};
-
-static void ClearBackwardRefs(VP8LBackwardRefs* const refs) {
-  assert(refs != NULL);
-  if (refs->tail_ != NULL) {
-    *refs->tail_ = refs->free_blocks_;  // recycle all blocks at once
-  }
-  refs->free_blocks_ = refs->refs_;
-  refs->tail_ = &refs->refs_;
-  refs->last_block_ = NULL;
-  refs->refs_ = NULL;
-}
-
-void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
-  assert(refs != NULL);
-  ClearBackwardRefs(refs);
-  while (refs->free_blocks_ != NULL) {
-    PixOrCopyBlock* const next = refs->free_blocks_->next_;
-    WebPSafeFree(refs->free_blocks_);
-    refs->free_blocks_ = next;
-  }
-}
-
-void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
-  assert(refs != NULL);
-  memset(refs, 0, sizeof(*refs));
-  refs->tail_ = &refs->refs_;
-  refs->block_size_ =
-      (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
-}
-
-VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
-  VP8LRefsCursor c;
-  c.cur_block_ = refs->refs_;
-  if (refs->refs_ != NULL) {
-    c.cur_pos = c.cur_block_->start_;
-    c.last_pos_ = c.cur_pos + c.cur_block_->size_;
-  } else {
-    c.cur_pos = NULL;
-    c.last_pos_ = NULL;
-  }
-  return c;
-}
-
-void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
-  PixOrCopyBlock* const b = c->cur_block_->next_;
-  c->cur_pos = (b == NULL) ? NULL : b->start_;
-  c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
-  c->cur_block_ = b;
-}
-
-// Create a new block, either from the free list or allocated
-static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
-  PixOrCopyBlock* b = refs->free_blocks_;
-  if (b == NULL) {   // allocate new memory chunk
-    const size_t total_size =
-        sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
-    b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
-    if (b == NULL) {
-      refs->error_ |= 1;
-      return NULL;
-    }
-    b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b));  // not always aligned
-  } else {  // recycle from free-list
-    refs->free_blocks_ = b->next_;
-  }
-  *refs->tail_ = b;
-  refs->tail_ = &b->next_;
-  refs->last_block_ = b;
-  b->next_ = NULL;
-  b->size_ = 0;
-  return b;
-}
-
-static WEBP_INLINE void BackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
-                                              const PixOrCopy v) {
-  PixOrCopyBlock* b = refs->last_block_;
-  if (b == NULL || b->size_ == refs->block_size_) {
-    b = BackwardRefsNewBlock(refs);
-    if (b == NULL) return;   // refs->error_ is set
-  }
-  b->start_[b->size_++] = v;
-}
-
-int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
-                         VP8LBackwardRefs* const dst) {
-  const PixOrCopyBlock* b = src->refs_;
-  ClearBackwardRefs(dst);
-  assert(src->block_size_ == dst->block_size_);
-  while (b != NULL) {
-    PixOrCopyBlock* const new_b = BackwardRefsNewBlock(dst);
-    if (new_b == NULL) return 0;   // dst->error_ is set
-    memcpy(new_b->start_, b->start_, b->size_ * sizeof(*b->start_));
-    new_b->size_ = b->size_;
-    b = b->next_;
-  }
-  return 1;
-}
-
-// -----------------------------------------------------------------------------
-// Hash chains
-
-int VP8LHashChainInit(VP8LHashChain* const p, int size) {
-  assert(p->size_ == 0);
-  assert(p->offset_length_ == NULL);
-  assert(size > 0);
-  p->offset_length_ =
-      (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
-  if (p->offset_length_ == NULL) return 0;
-  p->size_ = size;
-
-  return 1;
-}
-
-void VP8LHashChainClear(VP8LHashChain* const p) {
-  assert(p != NULL);
-  WebPSafeFree(p->offset_length_);
-
-  p->size_ = 0;
-  p->offset_length_ = NULL;
-}
-
-// -----------------------------------------------------------------------------
-
-#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
-#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
-
-static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
-  uint32_t key;
-  key  = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
-  key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
-  key = key >> (32 - HASH_BITS);
-  return key;
-}
-
-// Returns the maximum number of hash chain lookups to do for a
-// given compression quality. Return value in range [8, 86].
-static int GetMaxItersForQuality(int quality) {
-  return 8 + (quality * quality) / 128;
-}
-
-static int GetWindowSizeForHashChain(int quality, int xsize) {
-  const int max_window_size = (quality > 75) ? WINDOW_SIZE
-                            : (quality > 50) ? (xsize << 8)
-                            : (quality > 25) ? (xsize << 6)
-                            : (xsize << 4);
-  assert(xsize > 0);
-  return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
-}
-
-static WEBP_INLINE int MaxFindCopyLength(int len) {
-  return (len < MAX_LENGTH) ? len : MAX_LENGTH;
-}
-
-int VP8LHashChainFill(VP8LHashChain* const p, int quality,
-                      const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort) {
-  const int size = xsize * ysize;
-  const int iter_max = GetMaxItersForQuality(quality);
-  const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
-  int pos;
-  int argb_comp;
-  uint32_t base_position;
-  int32_t* hash_to_first_index;
-  // Temporarily use the p->offset_length_ as a hash chain.
-  int32_t* chain = (int32_t*)p->offset_length_;
-  assert(size > 0);
-  assert(p->size_ != 0);
-  assert(p->offset_length_ != NULL);
-
-  if (size <= 2) {
-    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
-    return 1;
-  }
-
-  hash_to_first_index =
-      (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
-  if (hash_to_first_index == NULL) return 0;
-
-  // Set the int32_t array to -1.
-  memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
-  // Fill the chain linking pixels with the same hash.
-  argb_comp = (argb[0] == argb[1]);
-  for (pos = 0; pos < size - 2;) {
-    uint32_t hash_code;
-    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
-    if (argb_comp && argb_comp_next) {
-      // Consecutive pixels with the same color will share the same hash.
-      // We therefore use a different hash: the color and its repetition
-      // length.
-      uint32_t tmp[2];
-      uint32_t len = 1;
-      tmp[0] = argb[pos];
-      // Figure out how far the pixels are the same.
-      // The last pixel has a different 64 bit hash, as its next pixel does
-      // not have the same color, so we just need to get to the last pixel equal
-      // to its follower.
-      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
-        ++len;
-      }
-      if (len > MAX_LENGTH) {
-        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
-        // because they are linked to their predecessor and we automatically
-        // check that in the main for loop below. Skipping means setting no
-        // predecessor in the chain, hence -1.
-        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
-        pos += len - MAX_LENGTH;
-        len = MAX_LENGTH;
-      }
-      // Process the rest of the hash chain.
-      while (len) {
-        tmp[1] = len--;
-        hash_code = GetPixPairHash64(tmp);
-        chain[pos] = hash_to_first_index[hash_code];
-        hash_to_first_index[hash_code] = pos++;
-      }
-      argb_comp = 0;
-    } else {
-      // Just move one pixel forward.
-      hash_code = GetPixPairHash64(argb + pos);
-      chain[pos] = hash_to_first_index[hash_code];
-      hash_to_first_index[hash_code] = pos++;
-      argb_comp = argb_comp_next;
-    }
-  }
-  // Process the penultimate pixel.
-  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
-
-  WebPSafeFree(hash_to_first_index);
-
-  // Find the best match interval at each pixel, defined by an offset to the
-  // pixel and a length. The right-most pixel cannot match anything to the right
-  // (hence a best length of 0) and the left-most pixel nothing to the left
-  // (hence an offset of 0).
-  assert(size > 2);
-  p->offset_length_[0] = p->offset_length_[size - 1] = 0;
-  for (base_position = size - 2; base_position > 0;) {
-    const int max_len = MaxFindCopyLength(size - 1 - base_position);
-    const uint32_t* const argb_start = argb + base_position;
-    int iter = iter_max;
-    int best_length = 0;
-    uint32_t best_distance = 0;
-    uint32_t best_argb;
-    const int min_pos =
-        (base_position > window_size) ? base_position - window_size : 0;
-    const int length_max = (max_len < 256) ? max_len : 256;
-    uint32_t max_base_position;
-
-    pos = chain[base_position];
-    if (!low_effort) {
-      int curr_length;
-      // Heuristic: use the comparison with the above line as an initialization.
-      if (base_position >= (uint32_t)xsize) {
-        curr_length = FindMatchLength(argb_start - xsize, argb_start,
-                                      best_length, max_len);
-        if (curr_length > best_length) {
-          best_length = curr_length;
-          best_distance = xsize;
-        }
-        --iter;
-      }
-      // Heuristic: compare to the previous pixel.
-      curr_length =
-          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
-      if (curr_length > best_length) {
-        best_length = curr_length;
-        best_distance = 1;
-      }
-      --iter;
-      // Skip the for loop if we already have the maximum.
-      if (best_length == MAX_LENGTH) pos = min_pos - 1;
-    }
-    best_argb = argb_start[best_length];
-
-    for (; pos >= min_pos && --iter; pos = chain[pos]) {
-      int curr_length;
-      assert(base_position > (uint32_t)pos);
-
-      if (argb[pos + best_length] != best_argb) continue;
-
-      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
-      if (best_length < curr_length) {
-        best_length = curr_length;
-        best_distance = base_position - pos;
-        best_argb = argb_start[best_length];
-        // Stop if we have reached a good enough length.
-        if (best_length >= length_max) break;
-      }
-    }
-    // We have the best match but in case the two intervals continue matching
-    // to the left, we have the best matches for the left-extended pixels.
-    max_base_position = base_position;
-    while (1) {
-      assert(best_length <= MAX_LENGTH);
-      assert(best_distance <= WINDOW_SIZE);
-      p->offset_length_[base_position] =
-          (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
-      --base_position;
-      // Stop if we don't have a match or if we are out of bounds.
-      if (best_distance == 0 || base_position == 0) break;
-      // Stop if we cannot extend the matching intervals to the left.
-      if (base_position < best_distance ||
-          argb[base_position - best_distance] != argb[base_position]) {
-        break;
-      }
-      // Stop if we are matching at its limit because there could be a closer
-      // matching interval with the same maximum length. Then again, if the
-      // matching interval is as close as possible (best_distance == 1), we will
-      // never find anything better so let's continue.
-      if (best_length == MAX_LENGTH && best_distance != 1 &&
-          base_position + MAX_LENGTH < max_base_position) {
-        break;
-      }
-      if (best_length < MAX_LENGTH) {
-        ++best_length;
-        max_base_position = base_position;
-      }
-    }
-  }
-  return 1;
-}
-
-static WEBP_INLINE int HashChainFindOffset(const VP8LHashChain* const p,
-                                           const int base_position) {
-  return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
-}
-
-static WEBP_INLINE int HashChainFindLength(const VP8LHashChain* const p,
-                                           const int base_position) {
-  return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
-}
-
-static WEBP_INLINE void HashChainFindCopy(const VP8LHashChain* const p,
-                                          int base_position,
-                                          int* const offset_ptr,
-                                          int* const length_ptr) {
-  *offset_ptr = HashChainFindOffset(p, base_position);
-  *length_ptr = HashChainFindLength(p, base_position);
-}
-
-static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
-                                         VP8LColorCache* const hashers,
-                                         VP8LBackwardRefs* const refs) {
-  PixOrCopy v;
-  if (use_color_cache) {
-    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
-    if (VP8LColorCacheLookup(hashers, key) == pixel) {
-      v = PixOrCopyCreateCacheIdx(key);
-    } else {
-      v = PixOrCopyCreateLiteral(pixel);
-      VP8LColorCacheSet(hashers, key, pixel);
-    }
-  } else {
-    v = PixOrCopyCreateLiteral(pixel);
-  }
-  BackwardRefsCursorAdd(refs, v);
-}
-
-static int BackwardReferencesRle(int xsize, int ysize,
-                                 const uint32_t* const argb,
-                                 int cache_bits, VP8LBackwardRefs* const refs) {
-  const int pix_count = xsize * ysize;
-  int i, k;
-  const int use_color_cache = (cache_bits > 0);
-  VP8LColorCache hashers;
-
-  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
-    return 0;
-  }
-  ClearBackwardRefs(refs);
-  // Add first pixel as literal.
-  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
-  i = 1;
-  while (i < pix_count) {
-    const int max_len = MaxFindCopyLength(pix_count - i);
-    const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
-    const int prev_row_len = (i < xsize) ? 0 :
-        FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
-    if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
-      // We don't need to update the color cache here since it is always the
-      // same pixel being copied, and that does not change the color cache
-      // state.
-      i += rle_len;
-    } else if (prev_row_len >= MIN_LENGTH) {
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
-      if (use_color_cache) {
-        for (k = 0; k < prev_row_len; ++k) {
-          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        }
-      }
-      i += prev_row_len;
-    } else {
-      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
-      i++;
-    }
-  }
-  if (use_color_cache) VP8LColorCacheClear(&hashers);
-  return !refs->error_;
-}
-
-static int BackwardReferencesLz77(int xsize, int ysize,
-                                  const uint32_t* const argb, int cache_bits,
-                                  const VP8LHashChain* const hash_chain,
-                                  VP8LBackwardRefs* const refs) {
-  int i;
-  int i_last_check = -1;
-  int ok = 0;
-  int cc_init = 0;
-  const int use_color_cache = (cache_bits > 0);
-  const int pix_count = xsize * ysize;
-  VP8LColorCache hashers;
-
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-  ClearBackwardRefs(refs);
-  for (i = 0; i < pix_count;) {
-    // Alternative#1: Code the pixels starting at 'i' using backward reference.
-    int offset = 0;
-    int len = 0;
-    int j;
-    HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len >= MIN_LENGTH) {
-      const int len_ini = len;
-      int max_reach = 0;
-      assert(i + len < pix_count);
-      // Only start from what we have not checked already.
-      i_last_check = (i > i_last_check) ? i : i_last_check;
-      // We know the best match for the current pixel but we try to find the
-      // best matches for the current pixel AND the next one combined.
-      // The naive method would use the intervals:
-      // [i,i+len) + [i+len, length of best match at i+len)
-      // while we check if we can use:
-      // [i,j) (where j<=i+len) + [j, length of best match at j)
-      for (j = i_last_check + 1; j <= i + len_ini; ++j) {
-        const int len_j = HashChainFindLength(hash_chain, j);
-        const int reach =
-            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
-        if (reach > max_reach) {
-          len = j - i;
-          max_reach = reach;
-        }
-      }
-    } else {
-      len = 1;
-    }
-    // Go with literal or backward reference.
-    assert(len > 0);
-    if (len == 1) {
-      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
-    } else {
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
-      if (use_color_cache) {
-        for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
-      }
-    }
-    i += len;
-  }
-
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  return ok;
-}
-
-// -----------------------------------------------------------------------------
-
-typedef struct {
-  double alpha_[VALUES_IN_BYTE];
-  double red_[VALUES_IN_BYTE];
-  double blue_[VALUES_IN_BYTE];
-  double distance_[NUM_DISTANCE_CODES];
-  double* literal_;
-} CostModel;
-
-static int BackwardReferencesTraceBackwards(
-    int xsize, int ysize, const uint32_t* const argb, int quality,
-    int cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs);
-
-static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], double output[]) {
-  uint32_t sum = 0;
-  int nonzeros = 0;
-  int i;
-  for (i = 0; i < num_symbols; ++i) {
-    sum += population_counts[i];
-    if (population_counts[i] > 0) {
-      ++nonzeros;
-    }
-  }
-  if (nonzeros <= 1) {
-    memset(output, 0, num_symbols * sizeof(*output));
-  } else {
-    const double logsum = VP8LFastLog2(sum);
-    for (i = 0; i < num_symbols; ++i) {
-      output[i] = logsum - VP8LFastLog2(population_counts[i]);
-    }
-  }
-}
-
-static int CostModelBuild(CostModel* const m, int cache_bits,
-                          VP8LBackwardRefs* const refs) {
-  int ok = 0;
-  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
-  if (histo == NULL) goto Error;
-
-  VP8LHistogramCreate(histo, refs, cache_bits);
-
-  ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(histo->palette_code_bits_),
-      histo->literal_, m->literal_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->red_, m->red_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->blue_, m->blue_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
-  ConvertPopulationCountTableToBitEstimates(
-      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
-  ok = 1;
-
- Error:
-  VP8LFreeHistogram(histo);
-  return ok;
-}
-
-static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
-  return m->alpha_[v >> 24] +
-         m->red_[(v >> 16) & 0xff] +
-         m->literal_[(v >> 8) & 0xff] +
-         m->blue_[v & 0xff];
-}
-
-static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
-  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return m->literal_[literal_idx];
-}
-
-static WEBP_INLINE double GetLengthCost(const CostModel* const m,
-                                        uint32_t length) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
-}
-
-static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
-                                          uint32_t distance) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
-}
-
-static void AddSingleLiteralWithCostModel(const uint32_t* const argb,
-                                          VP8LColorCache* const hashers,
-                                          const CostModel* const cost_model,
-                                          int idx, int use_color_cache,
-                                          double prev_cost, float* const cost,
-                                          uint16_t* const dist_array) {
-  double cost_val = prev_cost;
-  const uint32_t color = argb[0];
-  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
-  if (ix >= 0) {
-    // use_color_cache is true and hashers contains color
-    const double mul0 = 0.68;
-    cost_val += GetCacheCost(cost_model, ix) * mul0;
-  } else {
-    const double mul1 = 0.82;
-    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
-    cost_val += GetLiteralCost(cost_model, color) * mul1;
-  }
-  if (cost[idx] > cost_val) {
-    cost[idx] = (float)cost_val;
-    dist_array[idx] = 1;  // only one is inserted.
-  }
-}
-
-// -----------------------------------------------------------------------------
-// CostManager and interval handling
-
-// Empirical value to avoid high memory consumption but good for performance.
-#define COST_CACHE_INTERVAL_SIZE_MAX 100
-
-// To perform backward reference every pixel at index index_ is considered and
-// the cost for the MAX_LENGTH following pixels computed. Those following pixels
-// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
-//     distance_cost_ at index_ + GetLengthCost(cost_model, k)
-//            (named cost)            (named cached cost)
-// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
-// array of size MAX_LENGTH.
-// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
-// minimal values using intervals, for which lower_ and upper_ bounds are kept.
-// An interval is defined by the index_ of the pixel that generated it and
-// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
-// it contains the minimum value for pixels between start_ and end_.
-// Intervals are stored in a linked list and ordered by start_. When a new
-// interval has a better minimum, old intervals are split or removed.
-typedef struct CostInterval CostInterval;
-struct CostInterval {
-  double lower_;
-  double upper_;
-  int start_;
-  int end_;
-  double distance_cost_;
-  int index_;
-  CostInterval* previous_;
-  CostInterval* next_;
-};
-
-// The GetLengthCost(cost_model, k) part of the costs is also bounded for
-// efficiency in a set of intervals of a different type.
-// If those intervals are small enough, they are not used for comparison and
-// written into the costs right away.
-typedef struct {
-  double lower_;  // Lower bound of the interval.
-  double upper_;  // Upper bound of the interval.
-  int start_;
-  int end_;       // Exclusive.
-  int do_write_;  // If !=0, the interval is saved to cost instead of being kept
-                  // for comparison.
-} CostCacheInterval;
-
-// This structure is in charge of managing intervals and costs.
-// It caches the different CostCacheInterval, caches the different
-// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
-// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
-#define COST_MANAGER_MAX_FREE_LIST 10
-typedef struct {
-  CostInterval* head_;
-  int count_;  // The number of stored intervals.
-  CostCacheInterval* cache_intervals_;
-  size_t cache_intervals_size_;
-  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
-  double min_cost_cache_;          // The minimum value in cost_cache_[1:].
-  double max_cost_cache_;          // The maximum value in cost_cache_[1:].
-  float* costs_;
-  uint16_t* dist_array_;
-  // Most of the time, we only need few intervals -> use a free-list, to avoid
-  // fragmentation with small allocs in most common cases.
-  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
-  CostInterval* free_intervals_;
-  // These are regularly malloc'd remains. This list can't grow larger than than
-  // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
-  CostInterval* recycled_intervals_;
-  // Buffer used in BackwardReferencesHashChainDistanceOnly to store the ends
-  // of the intervals that can have impacted the cost at a pixel.
-  int* interval_ends_;
-  int interval_ends_size_;
-} CostManager;
-
-static int IsCostCacheIntervalWritable(int start, int end) {
-  // 100 is the length for which we consider an interval for comparison, and not
-  // for writing.
-  // The first intervals are very small and go in increasing size. This constant
-  // helps merging them into one big interval (up to index 150/200 usually from
-  // which intervals start getting much bigger).
-  // This value is empirical.
-  return (end - start + 1 < 100);
-}
-
-static void CostIntervalAddToFreeList(CostManager* const manager,
-                                      CostInterval* const interval) {
-  interval->next_ = manager->free_intervals_;
-  manager->free_intervals_ = interval;
-}
-
-static int CostIntervalIsInFreeList(const CostManager* const manager,
-                                    const CostInterval* const interval) {
-  return (interval >= &manager->intervals_[0] &&
-          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
-}
-
-static void CostManagerInitFreeList(CostManager* const manager) {
-  int i;
-  manager->free_intervals_ = NULL;
-  for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
-    CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
-  }
-}
-
-static void DeleteIntervalList(CostManager* const manager,
-                               const CostInterval* interval) {
-  while (interval != NULL) {
-    const CostInterval* const next = interval->next_;
-    if (!CostIntervalIsInFreeList(manager, interval)) {
-      WebPSafeFree((void*)interval);
-    }  // else: do nothing
-    interval = next;
-  }
-}
-
-static void CostManagerClear(CostManager* const manager) {
-  if (manager == NULL) return;
-
-  WebPSafeFree(manager->costs_);
-  WebPSafeFree(manager->cache_intervals_);
-  WebPSafeFree(manager->interval_ends_);
-
-  // Clear the interval lists.
-  DeleteIntervalList(manager, manager->head_);
-  manager->head_ = NULL;
-  DeleteIntervalList(manager, manager->recycled_intervals_);
-  manager->recycled_intervals_ = NULL;
-
-  // Reset pointers, count_ and cache_intervals_size_.
-  memset(manager, 0, sizeof(*manager));
-  CostManagerInitFreeList(manager);
-}
-
-static int CostManagerInit(CostManager* const manager,
-                           uint16_t* const dist_array, int pix_count,
-                           const CostModel* const cost_model) {
-  int i;
-  const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
-  // This constant is tied to the cost_model we use.
-  // Empirically, differences between intervals is usually of more than 1.
-  const double min_cost_diff = 0.1;
-
-  manager->costs_ = NULL;
-  manager->cache_intervals_ = NULL;
-  manager->interval_ends_ = NULL;
-  manager->head_ = NULL;
-  manager->recycled_intervals_ = NULL;
-  manager->count_ = 0;
-  manager->dist_array_ = dist_array;
-  CostManagerInitFreeList(manager);
-
-  // Fill in the cost_cache_.
-  manager->cache_intervals_size_ = 1;
-  manager->cost_cache_[0] = 0;
-  for (i = 1; i < cost_cache_size; ++i) {
-    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
-    // Get an approximation of the number of bound intervals.
-    if (fabs(manager->cost_cache_[i] - manager->cost_cache_[i - 1]) >
-        min_cost_diff) {
-      ++manager->cache_intervals_size_;
-    }
-    // Compute the minimum of cost_cache_.
-    if (i == 1) {
-      manager->min_cost_cache_ = manager->cost_cache_[1];
-      manager->max_cost_cache_ = manager->cost_cache_[1];
-    } else if (manager->cost_cache_[i] < manager->min_cost_cache_) {
-      manager->min_cost_cache_ = manager->cost_cache_[i];
-    } else if (manager->cost_cache_[i] > manager->max_cost_cache_) {
-      manager->max_cost_cache_ = manager->cost_cache_[i];
-    }
-  }
-
-  // With the current cost models, we have 15 intervals, so we are safe by
-  // setting a maximum of COST_CACHE_INTERVAL_SIZE_MAX.
-  if (manager->cache_intervals_size_ > COST_CACHE_INTERVAL_SIZE_MAX) {
-    manager->cache_intervals_size_ = COST_CACHE_INTERVAL_SIZE_MAX;
-  }
-  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
-      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
-  if (manager->cache_intervals_ == NULL) {
-    CostManagerClear(manager);
-    return 0;
-  }
-
-  // Fill in the cache_intervals_.
-  {
-    double cost_prev = -1e38f;  // unprobably low initial value
-    CostCacheInterval* prev = NULL;
-    CostCacheInterval* cur = manager->cache_intervals_;
-    const CostCacheInterval* const end =
-        manager->cache_intervals_ + manager->cache_intervals_size_;
-
-    // Consecutive values in cost_cache_ are compared and if a big enough
-    // difference is found, a new interval is created and bounded.
-    for (i = 0; i < cost_cache_size; ++i) {
-      const double cost_val = manager->cost_cache_[i];
-      if (i == 0 ||
-          (fabs(cost_val - cost_prev) > min_cost_diff && cur + 1 < end)) {
-        if (i > 1) {
-          const int is_writable =
-              IsCostCacheIntervalWritable(cur->start_, cur->end_);
-          // Merge with the previous interval if both are writable.
-          if (is_writable && cur != manager->cache_intervals_ &&
-              prev->do_write_) {
-            // Update the previous interval.
-            prev->end_ = cur->end_;
-            if (cur->lower_ < prev->lower_) {
-              prev->lower_ = cur->lower_;
-            } else if (cur->upper_ > prev->upper_) {
-              prev->upper_ = cur->upper_;
-            }
-          } else {
-            cur->do_write_ = is_writable;
-            prev = cur;
-            ++cur;
-          }
-        }
-        // Initialize an interval.
-        cur->start_ = i;
-        cur->do_write_ = 0;
-        cur->lower_ = cost_val;
-        cur->upper_ = cost_val;
-      } else {
-        // Update the current interval bounds.
-        if (cost_val < cur->lower_) {
-          cur->lower_ = cost_val;
-        } else if (cost_val > cur->upper_) {
-          cur->upper_ = cost_val;
-        }
-      }
-      cur->end_ = i + 1;
-      cost_prev = cost_val;
-    }
-    manager->cache_intervals_size_ = cur + 1 - manager->cache_intervals_;
-  }
-
-  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
-  if (manager->costs_ == NULL) {
-    CostManagerClear(manager);
-    return 0;
-  }
-  // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
-
-  // The cost at pixel is influenced by the cost intervals from previous pixels.
-  // Let us take the specific case where the offset is the same (which actually
-  // happens a lot in case of uniform regions).
-  // pixel i contributes to j>i a cost of: offset cost + cost_cache_[j-i]
-  // pixel i+1 contributes to j>i a cost of: 2*offset cost + cost_cache_[j-i-1]
-  // pixel i+2 contributes to j>i a cost of: 3*offset cost + cost_cache_[j-i-2]
-  // and so on.
-  // A pixel i influences the following length(j) < MAX_LENGTH pixels. What is
-  // the value of j such that pixel i + j cannot influence any of those pixels?
-  // This value is such that:
-  //               max of cost_cache_ < j*offset cost + min of cost_cache_
-  // (pixel i + j 's cost cannot beat the worst cost given by pixel i).
-  // This value will be used to optimize the cost computation in
-  // BackwardReferencesHashChainDistanceOnly.
-  {
-    // The offset cost is computed in GetDistanceCost and has a minimum value of
-    // the minimum in cost_model->distance_. The case where the offset cost is 0
-    // will be dealt with differently later so we are only interested in the
-    // minimum non-zero offset cost.
-    double offset_cost_min = 0.;
-    int size;
-    for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-      if (cost_model->distance_[i] != 0) {
-        if (offset_cost_min == 0.) {
-          offset_cost_min = cost_model->distance_[i];
-        } else if (cost_model->distance_[i] < offset_cost_min) {
-          offset_cost_min = cost_model->distance_[i];
-        }
-      }
-    }
-    // In case all the cost_model->distance_ is 0, the next non-zero cost we
-    // can have is from the extra bit in GetDistanceCost, hence 1.
-    if (offset_cost_min < 1.) offset_cost_min = 1.;
-
-    size = 1 + (int)ceil((manager->max_cost_cache_ - manager->min_cost_cache_) /
-                         offset_cost_min);
-    // Empirically, we usually end up with a value below 100.
-    if (size > MAX_LENGTH) size = MAX_LENGTH;
-
-    manager->interval_ends_ =
-        (int*)WebPSafeMalloc(size, sizeof(*manager->interval_ends_));
-    if (manager->interval_ends_ == NULL) {
-      CostManagerClear(manager);
-      return 0;
-    }
-    manager->interval_ends_size_ = size;
-  }
-
-  return 1;
-}
-
-// Given the distance_cost for pixel 'index', update the cost at pixel 'i' if it
-// is smaller than the previously computed value.
-static WEBP_INLINE void UpdateCost(CostManager* const manager, int i, int index,
-                                   double distance_cost) {
-  int k = i - index;
-  double cost_tmp;
-  assert(k >= 0 && k < MAX_LENGTH);
-  cost_tmp = distance_cost + manager->cost_cache_[k];
-
-  if (manager->costs_[i] > cost_tmp) {
-    manager->costs_[i] = (float)cost_tmp;
-    manager->dist_array_[i] = k + 1;
-  }
-}
-
-// Given the distance_cost for pixel 'index', update the cost for all the pixels
-// between 'start' and 'end' excluded.
-static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
-                                              int start, int end, int index,
-                                              double distance_cost) {
-  int i;
-  for (i = start; i < end; ++i) UpdateCost(manager, i, index, distance_cost);
-}
-
-// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
-static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
-                                         CostInterval* const prev,
-                                         CostInterval* const next) {
-  if (prev != NULL) {
-    prev->next_ = next;
-  } else {
-    manager->head_ = next;
-  }
-
-  if (next != NULL) next->previous_ = prev;
-}
-
-// Pop an interval in the manager.
-static WEBP_INLINE void PopInterval(CostManager* const manager,
-                                    CostInterval* const interval) {
-  CostInterval* const next = interval->next_;
-
-  if (interval == NULL) return;
-
-  ConnectIntervals(manager, interval->previous_, next);
-  if (CostIntervalIsInFreeList(manager, interval)) {
-    CostIntervalAddToFreeList(manager, interval);
-  } else {  // recycle regularly malloc'd intervals too
-    interval->next_ = manager->recycled_intervals_;
-    manager->recycled_intervals_ = interval;
-  }
-  --manager->count_;
-  assert(manager->count_ >= 0);
-}
-
-// Update the cost at index i by going over all the stored intervals that
-// overlap with i.
-static WEBP_INLINE void UpdateCostPerIndex(CostManager* const manager, int i) {
-  CostInterval* current = manager->head_;
-
-  while (current != NULL && current->start_ <= i) {
-    if (current->end_ <= i) {
-      // We have an outdated interval, remove it.
-      CostInterval* next = current->next_;
-      PopInterval(manager, current);
-      current = next;
-    } else {
-      UpdateCost(manager, i, current->index_, current->distance_cost_);
-      current = current->next_;
-    }
-  }
-}
-
-// Given a current orphan interval and its previous interval, before
-// it was orphaned (which can be NULL), set it at the right place in the list
-// of intervals using the start_ ordering and the previous interval as a hint.
-static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
-                                               CostInterval* const current,
-                                               CostInterval* previous) {
-  assert(current != NULL);
-
-  if (previous == NULL) previous = manager->head_;
-  while (previous != NULL && current->start_ < previous->start_) {
-    previous = previous->previous_;
-  }
-  while (previous != NULL && previous->next_ != NULL &&
-         previous->next_->start_ < current->start_) {
-    previous = previous->next_;
-  }
-
-  if (previous != NULL) {
-    ConnectIntervals(manager, current, previous->next_);
-  } else {
-    ConnectIntervals(manager, current, manager->head_);
-  }
-  ConnectIntervals(manager, previous, current);
-}
-
-// Insert an interval in the list contained in the manager by starting at
-// interval_in as a hint. The intervals are sorted by start_ value.
-static WEBP_INLINE void InsertInterval(CostManager* const manager,
-                                       CostInterval* const interval_in,
-                                       double distance_cost, double lower,
-                                       double upper, int index, int start,
-                                       int end) {
-  CostInterval* interval_new;
-
-  if (IsCostCacheIntervalWritable(start, end) ||
-      manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
-    // Write down the interval if it is too small.
-    UpdateCostPerInterval(manager, start, end, index, distance_cost);
-    return;
-  }
-  if (manager->free_intervals_ != NULL) {
-    interval_new = manager->free_intervals_;
-    manager->free_intervals_ = interval_new->next_;
-  } else if (manager->recycled_intervals_ != NULL) {
-    interval_new = manager->recycled_intervals_;
-    manager->recycled_intervals_ = interval_new->next_;
-  } else {   // malloc for good
-    interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
-    if (interval_new == NULL) {
-      // Write down the interval if we cannot create it.
-      UpdateCostPerInterval(manager, start, end, index, distance_cost);
-      return;
-    }
-  }
-
-  interval_new->distance_cost_ = distance_cost;
-  interval_new->lower_ = lower;
-  interval_new->upper_ = upper;
-  interval_new->index_ = index;
-  interval_new->start_ = start;
-  interval_new->end_ = end;
-  PositionOrphanInterval(manager, interval_new, interval_in);
-
-  ++manager->count_;
-}
-
-// When an interval has its start_ or end_ modified, it needs to be
-// repositioned in the linked list.
-static WEBP_INLINE void RepositionInterval(CostManager* const manager,
-                                           CostInterval* const interval) {
-  if (IsCostCacheIntervalWritable(interval->start_, interval->end_)) {
-    // Maybe interval has been resized and is small enough to be removed.
-    UpdateCostPerInterval(manager, interval->start_, interval->end_,
-                          interval->index_, interval->distance_cost_);
-    PopInterval(manager, interval);
-    return;
-  }
-
-  // Early exit if interval is at the right spot.
-  if ((interval->previous_ == NULL ||
-       interval->previous_->start_ <= interval->start_) &&
-      (interval->next_ == NULL ||
-       interval->start_ <= interval->next_->start_)) {
-    return;
-  }
-
-  ConnectIntervals(manager, interval->previous_, interval->next_);
-  PositionOrphanInterval(manager, interval, interval->previous_);
-}
-
-// Given a new cost interval defined by its start at index, its last value and
-// distance_cost, add its contributions to the previous intervals and costs.
-// If handling the interval or one of its subintervals becomes to heavy, its
-// contribution is added to the costs right away.
-static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     double distance_cost, int index,
-                                     int last) {
-  size_t i;
-  CostInterval* interval = manager->head_;
-  CostInterval* interval_next;
-  const CostCacheInterval* const cost_cache_intervals =
-      manager->cache_intervals_;
-
-  for (i = 0; i < manager->cache_intervals_size_ &&
-              cost_cache_intervals[i].start_ < last;
-       ++i) {
-    // Define the intersection of the ith interval with the new one.
-    int start = index + cost_cache_intervals[i].start_;
-    const int end = index + (cost_cache_intervals[i].end_ > last
-                                 ? last
-                                 : cost_cache_intervals[i].end_);
-    const double lower_in = cost_cache_intervals[i].lower_;
-    const double upper_in = cost_cache_intervals[i].upper_;
-    const double lower_full_in = distance_cost + lower_in;
-    const double upper_full_in = distance_cost + upper_in;
-
-    if (cost_cache_intervals[i].do_write_) {
-      UpdateCostPerInterval(manager, start, end, index, distance_cost);
-      continue;
-    }
-
-    for (; interval != NULL && interval->start_ < end && start < end;
-         interval = interval_next) {
-      const double lower_full_interval =
-          interval->distance_cost_ + interval->lower_;
-      const double upper_full_interval =
-          interval->distance_cost_ + interval->upper_;
-
-      interval_next = interval->next_;
-
-      // Make sure we have some overlap
-      if (start >= interval->end_) continue;
-
-      if (lower_full_in >= upper_full_interval) {
-        // When intervals are represented, the lower, the better.
-        // [**********************************************************]
-        // start                                                    end
-        //                   [----------------------------------]
-        //                   interval->start_       interval->end_
-        // If we are worse than what we already have, add whatever we have so
-        // far up to interval.
-        const int start_new = interval->end_;
-        InsertInterval(manager, interval, distance_cost, lower_in, upper_in,
-                       index, start, interval->start_);
-        start = start_new;
-        continue;
-      }
-
-      // We know the two intervals intersect.
-      if (upper_full_in >= lower_full_interval) {
-        // There is no clear cut on which is best, so let's keep both.
-        // [*********[*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*]***********]
-        // start     interval->start_     interval->end_         end
-        // OR
-        // [*********[*-*-*-*-*-*-*-*-*-*-*-]----------------------]
-        // start     interval->start_     end          interval->end_
-        const int end_new = (interval->end_ <= end) ? interval->end_ : end;
-        InsertInterval(manager, interval, distance_cost, lower_in, upper_in,
-                       index, start, end_new);
-        start = end_new;
-      } else if (start <= interval->start_ && interval->end_ <= end) {
-        //                   [----------------------------------]
-        //                   interval->start_       interval->end_
-        // [**************************************************************]
-        // start                                                        end
-        // We can safely remove the old interval as it is fully included.
-        PopInterval(manager, interval);
-      } else {
-        if (interval->start_ <= start && end <= interval->end_) {
-          // [--------------------------------------------------------------]
-          // interval->start_                                  interval->end_
-          //                     [*****************************]
-          //                     start                       end
-          // We have to split the old interval as it fully contains the new one.
-          const int end_original = interval->end_;
-          interval->end_ = start;
-          InsertInterval(manager, interval, interval->distance_cost_,
-                         interval->lower_, interval->upper_, interval->index_,
-                         end, end_original);
-        } else if (interval->start_ < start) {
-          // [------------------------------------]
-          // interval->start_        interval->end_
-          //                     [*****************************]
-          //                     start                       end
-          interval->end_ = start;
-        } else {
-          //              [------------------------------------]
-          //              interval->start_        interval->end_
-          // [*****************************]
-          // start                       end
-          interval->start_ = end;
-        }
-
-        // The interval has been modified, we need to reposition it or write it.
-        RepositionInterval(manager, interval);
-      }
-    }
-    // Insert the remaining interval from start to end.
-    InsertInterval(manager, interval, distance_cost, lower_in, upper_in, index,
-                   start, end);
-  }
-}
-
-static int BackwardReferencesHashChainDistanceOnly(
-    int xsize, int ysize, const uint32_t* const argb, int quality,
-    int cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs, uint16_t* const dist_array) {
-  int i;
-  int ok = 0;
-  int cc_init = 0;
-  const int pix_count = xsize * ysize;
-  const int use_color_cache = (cache_bits > 0);
-  const size_t literal_array_size = sizeof(double) *
-      (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
-       ((cache_bits > 0) ? (1 << cache_bits) : 0));
-  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
-  CostModel* const cost_model =
-      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
-  VP8LColorCache hashers;
-  const int skip_length = 32 + quality;
-  const int skip_min_distance_code = 2;
-  CostManager* cost_manager =
-      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
-
-  if (cost_model == NULL || cost_manager == NULL) goto Error;
-
-  cost_model->literal_ = (double*)(cost_model + 1);
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-
-  if (!CostModelBuild(cost_model, cache_bits, refs)) {
-    goto Error;
-  }
-
-  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
-    goto Error;
-  }
-
-  // We loop one pixel at a time, but store all currently best points to
-  // non-processed locations from this point.
-  dist_array[0] = 0;
-  // Add first pixel as literal.
-  AddSingleLiteralWithCostModel(argb + 0, &hashers, cost_model, 0,
-                                use_color_cache, 0.0, cost_manager->costs_,
-                                dist_array);
-
-  for (i = 1; i < pix_count - 1; ++i) {
-    int offset = 0, len = 0;
-    double prev_cost = cost_manager->costs_[i - 1];
-    HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len >= 2) {
-      // If we are dealing with a non-literal.
-      const int code = DistanceToPlaneCode(xsize, offset);
-      const double offset_cost = GetDistanceCost(cost_model, code);
-      const int first_i = i;
-      int j_max = 0, interval_ends_index = 0;
-      const int is_offset_zero = (offset_cost == 0.);
-
-      if (!is_offset_zero) {
-        j_max = (int)ceil(
-            (cost_manager->max_cost_cache_ - cost_manager->min_cost_cache_) /
-            offset_cost);
-        if (j_max < 1) {
-          j_max = 1;
-        } else if (j_max > cost_manager->interval_ends_size_ - 1) {
-          // This could only happen in the case of MAX_LENGTH.
-          j_max = cost_manager->interval_ends_size_ - 1;
-        }
-      }  // else j_max is unused anyway.
-
-      // Instead of considering all contributions from a pixel i by calling:
-      //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
-      // we optimize these contributions in case offset_cost stays the same for
-      // consecutive pixels. This describes a set of pixels similar to a
-      // previous set (e.g. constant color regions).
-      for (; i < pix_count - 1; ++i) {
-        int offset_next, len_next;
-        prev_cost = cost_manager->costs_[i - 1];
-
-        if (is_offset_zero) {
-          // No optimization can be made so we just push all of the
-          // contributions from i.
-          PushInterval(cost_manager, prev_cost, i, len);
-        } else {
-          // j_max is chosen as the smallest j such that:
-          //       max of cost_cache_ < j*offset cost + min of cost_cache_
-          // Therefore, the pixel influenced by i-j_max, cannot be influenced
-          // by i. Only the costs after the end of what i contributed need to be
-          // updated. cost_manager->interval_ends_ is a circular buffer that
-          // stores those ends.
-          const double distance_cost = prev_cost + offset_cost;
-          int j = cost_manager->interval_ends_[interval_ends_index];
-          if (i - first_i <= j_max ||
-              !IsCostCacheIntervalWritable(j, i + len)) {
-            PushInterval(cost_manager, distance_cost, i, len);
-          } else {
-            for (; j < i + len; ++j) {
-              UpdateCost(cost_manager, j, i, distance_cost);
-            }
-          }
-          // Store the new end in the circular buffer.
-          assert(interval_ends_index < cost_manager->interval_ends_size_);
-          cost_manager->interval_ends_[interval_ends_index] = i + len;
-          if (++interval_ends_index > j_max) interval_ends_index = 0;
-        }
-
-        // Check whether i is the last pixel to consider, as it is handled
-        // differently.
-        if (i + 1 >= pix_count - 1) break;
-        HashChainFindCopy(hash_chain, i + 1, &offset_next, &len_next);
-        if (offset_next != offset) break;
-        len = len_next;
-        UpdateCostPerIndex(cost_manager, i);
-        AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i,
-                                      use_color_cache, prev_cost,
-                                      cost_manager->costs_, dist_array);
-      }
-      // Submit the last pixel.
-      UpdateCostPerIndex(cost_manager, i + 1);
-
-      // This if is for speedup only. It roughly doubles the speed, and
-      // makes compression worse by .1 %.
-      if (len >= skip_length && code <= skip_min_distance_code) {
-        // Long copy for short distances, let's skip the middle
-        // lookups for better copies.
-        // 1) insert the hashes.
-        if (use_color_cache) {
-          int k;
-          for (k = 0; k < len; ++k) {
-            VP8LColorCacheInsert(&hashers, argb[i + k]);
-          }
-        }
-        // 2) jump.
-        {
-          const int i_next = i + len - 1;  // for loop does ++i, thus -1 here.
-          for (; i <= i_next; ++i) UpdateCostPerIndex(cost_manager, i + 1);
-          i = i_next;
-        }
-        goto next_symbol;
-      }
-      if (len > 2) {
-        // Also try the smallest interval possible (size 2).
-        double cost_total =
-            prev_cost + offset_cost + GetLengthCost(cost_model, 1);
-        if (cost_manager->costs_[i + 1] > cost_total) {
-          cost_manager->costs_[i + 1] = (float)cost_total;
-          dist_array[i + 1] = 2;
-        }
-      }
-    } else {
-      // The pixel is added as a single literal so just update the costs.
-      UpdateCostPerIndex(cost_manager, i + 1);
-    }
-
-    AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i,
-                                  use_color_cache, prev_cost,
-                                  cost_manager->costs_, dist_array);
-
- next_symbol: ;
-  }
-  // Handle the last pixel.
-  if (i == (pix_count - 1)) {
-    AddSingleLiteralWithCostModel(
-        argb + i, &hashers, cost_model, i, use_color_cache,
-        cost_manager->costs_[pix_count - 2], cost_manager->costs_, dist_array);
-  }
-
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  CostManagerClear(cost_manager);
-  WebPSafeFree(cost_model);
-  WebPSafeFree(cost_manager);
-  return ok;
-}
-
-// We pack the path at the end of *dist_array and return
-// a pointer to this part of the array. Example:
-// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint16_t* const dist_array,
-                           int dist_array_size,
-                           uint16_t** const chosen_path,
-                           int* const chosen_path_size) {
-  uint16_t* path = dist_array + dist_array_size;
-  uint16_t* cur = dist_array + dist_array_size - 1;
-  while (cur >= dist_array) {
-    const int k = *cur;
-    --path;
-    *path = k;
-    cur -= k;
-  }
-  *chosen_path = path;
-  *chosen_path_size = (int)(dist_array + dist_array_size - path);
-}
-
-static int BackwardReferencesHashChainFollowChosenPath(
-    const uint32_t* const argb, int cache_bits,
-    const uint16_t* const chosen_path, int chosen_path_size,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
-  const int use_color_cache = (cache_bits > 0);
-  int ix;
-  int i = 0;
-  int ok = 0;
-  int cc_init = 0;
-  VP8LColorCache hashers;
-
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-
-  ClearBackwardRefs(refs);
-  for (ix = 0; ix < chosen_path_size; ++ix) {
-    const int len = chosen_path[ix];
-    if (len != 1) {
-      int k;
-      const int offset = HashChainFindOffset(hash_chain, i);
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
-      if (use_color_cache) {
-        for (k = 0; k < len; ++k) {
-          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        }
-      }
-      i += len;
-    } else {
-      PixOrCopy v;
-      const int idx =
-          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
-      if (idx >= 0) {
-        // use_color_cache is true and hashers contains argb[i]
-        // push pixel as a color cache index
-        v = PixOrCopyCreateCacheIdx(idx);
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
-        v = PixOrCopyCreateLiteral(argb[i]);
-      }
-      BackwardRefsCursorAdd(refs, v);
-      ++i;
-    }
-  }
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  return ok;
-}
-
-// Returns 1 on success.
-static int BackwardReferencesTraceBackwards(
-    int xsize, int ysize, const uint32_t* const argb, int quality,
-    int cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs) {
-  int ok = 0;
-  const int dist_array_size = xsize * ysize;
-  uint16_t* chosen_path = NULL;
-  int chosen_path_size = 0;
-  uint16_t* dist_array =
-      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
-
-  if (dist_array == NULL) goto Error;
-
-  if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, argb, quality, cache_bits, hash_chain,
-      refs, dist_array)) {
-    goto Error;
-  }
-  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
-  if (!BackwardReferencesHashChainFollowChosenPath(
-          argb, cache_bits, chosen_path, chosen_path_size, hash_chain, refs)) {
-    goto Error;
-  }
-  ok = 1;
- Error:
-  WebPSafeFree(dist_array);
-  return ok;
-}
-
-static void BackwardReferences2DLocality(int xsize,
-                                         const VP8LBackwardRefs* const refs) {
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  while (VP8LRefsCursorOk(&c)) {
-    if (PixOrCopyIsCopy(c.cur_pos)) {
-      const int dist = c.cur_pos->argb_or_distance;
-      const int transformed_dist = DistanceToPlaneCode(xsize, dist);
-      c.cur_pos->argb_or_distance = transformed_dist;
-    }
-    VP8LRefsCursorNext(&c);
-  }
-}
-
-// Computes the entropies for a color cache size (in bits) between 0 (unused)
-// and cache_bits_max (inclusive).
-// Returns 1 on success, 0 in case of allocation error.
-static int ComputeCacheEntropies(const uint32_t* argb,
-                                 const VP8LBackwardRefs* const refs,
-                                 int cache_bits_max, double entropies[]) {
-  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
-  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
-  int ok = 0;
-  int i;
-
-  for (i = 0; i <= cache_bits_max; ++i) {
-    histos[i] = VP8LAllocateHistogram(i);
-    if (histos[i] == NULL) goto Error;
-    if (i == 0) continue;
-    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
-    if (!cc_init[i]) goto Error;
-  }
-
-  assert(cache_bits_max >= 0);
-  // Do not use the color cache for cache_bits=0.
-  while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histos[0], c.cur_pos);
-    VP8LRefsCursorNext(&c);
-  }
-  if (cache_bits_max > 0) {
-    c = VP8LRefsCursorInit(refs);
-    while (VP8LRefsCursorOk(&c)) {
-      const PixOrCopy* const v = c.cur_pos;
-      if (PixOrCopyIsLiteral(v)) {
-        const uint32_t pix = *argb++;
-        // The keys of the caches can be derived from the longest one.
-        int key = HashPix(pix, 32 - cache_bits_max);
-        for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
-          if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
-            ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
-          } else {
-            VP8LColorCacheSet(&hashers[i], key, pix);
-            ++histos[i]->blue_[pix & 0xff];
-            ++histos[i]->literal_[(pix >> 8) & 0xff];
-            ++histos[i]->red_[(pix >> 16) & 0xff];
-            ++histos[i]->alpha_[pix >> 24];
-          }
-        }
-      } else {
-        // Update the histograms for distance/length.
-        int len = PixOrCopyLength(v);
-        int code_dist, code_len, extra_bits;
-        uint32_t argb_prev = *argb ^ 0xffffffffu;
-        VP8LPrefixEncodeBits(len, &code_len, &extra_bits);
-        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code_dist, &extra_bits);
-        for (i = 1; i <= cache_bits_max; ++i) {
-          ++histos[i]->literal_[NUM_LITERAL_CODES + code_len];
-          ++histos[i]->distance_[code_dist];
-        }
-        // Update the colors caches.
-        do {
-          if (*argb != argb_prev) {
-            // Efficiency: insert only if the color changes.
-            int key = HashPix(*argb, 32 - cache_bits_max);
-            for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
-              hashers[i].colors_[key] = *argb;
-            }
-            argb_prev = *argb;
-          }
-          argb++;
-        } while (--len != 0);
-      }
-      VP8LRefsCursorNext(&c);
-    }
-  }
-  for (i = 0; i <= cache_bits_max; ++i) {
-    entropies[i] = VP8LHistogramEstimateBits(histos[i]);
-  }
-  ok = 1;
-Error:
-  for (i = 0; i <= cache_bits_max; ++i) {
-    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
-    VP8LFreeHistogram(histos[i]);
-  }
-  return ok;
-}
-
-// Evaluate optimal cache bits for the local color cache.
-// The input *best_cache_bits sets the maximum cache bits to use (passing 0
-// implies disabling the local color cache). The local color cache is also
-// disabled for the lower (<= 25) quality.
-// Returns 0 in case of memory error.
-static int CalculateBestCacheSize(const uint32_t* const argb,
-                                  int xsize, int ysize, int quality,
-                                  const VP8LHashChain* const hash_chain,
-                                  VP8LBackwardRefs* const refs,
-                                  int* const lz77_computed,
-                                  int* const best_cache_bits) {
-  int i;
-  int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
-  double entropy_min = MAX_ENTROPY;
-  double entropies[MAX_COLOR_CACHE_BITS + 1];
-
-  assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
-
-  *lz77_computed = 0;
-  if (cache_bits_high == 0) {
-    *best_cache_bits = 0;
-    // Local color cache is disabled.
-    return 1;
-  }
-  // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color cache
-  // is not that different in practice.
-  if (!BackwardReferencesLz77(xsize, ysize, argb, 0, hash_chain, refs)) {
-    return 0;
-  }
-  // Find the cache_bits giving the lowest entropy. The search is done in a
-  // brute-force way as the function (entropy w.r.t cache_bits) can be
-  // anything in practice.
-  if (!ComputeCacheEntropies(argb, refs, cache_bits_high, entropies)) {
-    return 0;
-  }
-  for (i = 0; i <= cache_bits_high; ++i) {
-    if (i == 0 || entropies[i] < entropy_min) {
-      entropy_min = entropies[i];
-      *best_cache_bits = i;
-    }
-  }
-  return 1;
-}
-
-// Update (in-place) backward references for specified cache_bits.
-static int BackwardRefsWithLocalCache(const uint32_t* const argb,
-                                      int cache_bits,
-                                      VP8LBackwardRefs* const refs) {
-  int pixel_index = 0;
-  VP8LColorCache hashers;
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
-
-  while (VP8LRefsCursorOk(&c)) {
-    PixOrCopy* const v = c.cur_pos;
-    if (PixOrCopyIsLiteral(v)) {
-      const uint32_t argb_literal = v->argb_or_distance;
-      const int ix = VP8LColorCacheContains(&hashers, argb_literal);
-      if (ix >= 0) {
-        // hashers contains argb_literal
-        *v = PixOrCopyCreateCacheIdx(ix);
-      } else {
-        VP8LColorCacheInsert(&hashers, argb_literal);
-      }
-      ++pixel_index;
-    } else {
-      // refs was created without local cache, so it can not have cache indexes.
-      int k;
-      assert(PixOrCopyIsCopy(v));
-      for (k = 0; k < v->len; ++k) {
-        VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
-      }
-    }
-    VP8LRefsCursorNext(&c);
-  }
-  VP8LColorCacheClear(&hashers);
-  return 1;
-}
-
-static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
-    int width, int height, const uint32_t* const argb,
-    int* const cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs refs_array[2]) {
-  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
-  *cache_bits = 0;
-  if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
-    return NULL;
-  }
-  BackwardReferences2DLocality(width, refs_lz77);
-  return refs_lz77;
-}
-
-static VP8LBackwardRefs* GetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int* const cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs refs_array[2]) {
-  int lz77_is_useful;
-  int lz77_computed;
-  double bit_cost_lz77, bit_cost_rle;
-  VP8LBackwardRefs* best = NULL;
-  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
-  VP8LBackwardRefs* refs_rle = &refs_array[1];
-  VP8LHistogram* histo = NULL;
-
-  if (!CalculateBestCacheSize(argb, width, height, quality, hash_chain,
-                              refs_lz77, &lz77_computed, cache_bits)) {
-    goto Error;
-  }
-
-  if (lz77_computed) {
-    // Transform refs_lz77 for the optimized cache_bits.
-    if (*cache_bits > 0) {
-      if (!BackwardRefsWithLocalCache(argb, *cache_bits, refs_lz77)) {
-        goto Error;
-      }
-    }
-  } else {
-    if (!BackwardReferencesLz77(width, height, argb, *cache_bits, hash_chain,
-                                refs_lz77)) {
-      goto Error;
-    }
-  }
-
-  if (!BackwardReferencesRle(width, height, argb, *cache_bits, refs_rle)) {
-    goto Error;
-  }
-
-  histo = VP8LAllocateHistogram(*cache_bits);
-  if (histo == NULL) goto Error;
-
-  {
-    // Evaluate LZ77 coding.
-    VP8LHistogramCreate(histo, refs_lz77, *cache_bits);
-    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
-    // Evaluate RLE coding.
-    VP8LHistogramCreate(histo, refs_rle, *cache_bits);
-    bit_cost_rle = VP8LHistogramEstimateBits(histo);
-    // Decide if LZ77 is useful.
-    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
-  }
-
-  // Choose appropriate backward reference.
-  if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
-    best = refs_lz77;   // default guess: lz77 is better
-    if (try_lz77_trace_backwards) {
-      VP8LBackwardRefs* const refs_trace = refs_rle;
-      if (!VP8LBackwardRefsCopy(refs_lz77, refs_trace)) {
-        best = NULL;
-        goto Error;
-      }
-      if (BackwardReferencesTraceBackwards(width, height, argb, quality,
-                                           *cache_bits, hash_chain,
-                                           refs_trace)) {
-        double bit_cost_trace;
-        // Evaluate LZ77 coding.
-        VP8LHistogramCreate(histo, refs_trace, *cache_bits);
-        bit_cost_trace = VP8LHistogramEstimateBits(histo);
-        if (bit_cost_trace < bit_cost_lz77) {
-          best = refs_trace;
-        }
-      }
-    }
-  } else {
-    best = refs_rle;
-  }
-
-  BackwardReferences2DLocality(width, best);
-
- Error:
-  VP8LFreeHistogram(histo);
-  return best;
-}
-
-VP8LBackwardRefs* VP8LGetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2]) {
-  if (low_effort) {
-    return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
-                                          hash_chain, refs_array);
-  } else {
-    return GetBackwardReferences(width, height, argb, quality, cache_bits,
-                                 hash_chain, refs_array);
-  }
-}
diff --git a/thirdparty/libwebp/dec/alpha_dec.c b/thirdparty/libwebp/src/dec/alpha_dec.c
index 83ffd4b609..bce735bfc2 100644
--- a/thirdparty/libwebp/dec/alpha_dec.c
+++ b/thirdparty/libwebp/src/dec/alpha_dec.c
@@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./alphai_dec.h"
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "../dsp/dsp.h"
-#include "../utils/quant_levels_dec_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/quant_levels_dec_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 //------------------------------------------------------------------------------
 // ALPHDecoder object.
diff --git a/thirdparty/libwebp/dec/alphai_dec.h b/thirdparty/libwebp/src/dec/alphai_dec.h
index 561e8151ee..e0fa281a55 100644
--- a/thirdparty/libwebp/dec/alphai_dec.h
+++ b/thirdparty/libwebp/src/dec/alphai_dec.h
@@ -11,11 +11,11 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#ifndef WEBP_DEC_ALPHAI_H_
-#define WEBP_DEC_ALPHAI_H_
+#ifndef WEBP_DEC_ALPHAI_DEC_H_
+#define WEBP_DEC_ALPHAI_DEC_H_
 
-#include "./webpi_dec.h"
-#include "../utils/filters_utils.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/filters_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_ALPHAI_H_ */
+#endif  /* WEBP_DEC_ALPHAI_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/buffer_dec.c b/thirdparty/libwebp/src/dec/buffer_dec.c
index c685fd5646..75eb3c40b4 100644
--- a/thirdparty/libwebp/dec/buffer_dec.c
+++ b/thirdparty/libwebp/src/dec/buffer_dec.c
@@ -13,15 +13,15 @@
 
 #include <stdlib.h>
 
-#include "./vp8i_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // WebPDecBuffer
 
 // Number of bytes per pixel for the different color-spaces.
-static const int kModeBpp[MODE_LAST] = {
+static const uint8_t kModeBpp[MODE_LAST] = {
   3, 4, 3, 4, 4, 2, 2,
   4, 4, 4, 2,    // pre-multiplied modes
   1, 1 };
@@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
 // strictly speaking, the very last (or first, if flipped) row
 // doesn't require padding.
 #define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
-    (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
 
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
   int ok = 1;
@@ -98,9 +98,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     uint64_t uv_size = 0, a_size = 0, total_size;
     // We need memory and it hasn't been allocated yet.
     // => initialize output buffer, now that dimensions are known.
-    const int stride = w * kModeBpp[mode];
-    const uint64_t size = (uint64_t)stride * h;
+    int stride;
+    uint64_t size;
 
+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+      return VP8_STATUS_INVALID_PARAM;
+    }
+    stride = w * kModeBpp[mode];
+    size = (uint64_t)stride * h;
     if (!WebPIsRGBMode(mode)) {
       uv_stride = (w + 1) / 2;
       uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
@@ -169,11 +174,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
   return VP8_STATUS_OK;
 }
 
-VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
                                     const WebPDecoderOptions* const options,
-                                    WebPDecBuffer* const out) {
+                                    WebPDecBuffer* const buffer) {
   VP8StatusCode status;
-  if (out == NULL || w <= 0 || h <= 0) {
+  if (buffer == NULL || width <= 0 || height <= 0) {
     return VP8_STATUS_INVALID_PARAM;
   }
   if (options != NULL) {    // First, apply options if there is any.
@@ -182,33 +187,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
       const int ch = options->crop_height;
       const int x = options->crop_left & ~1;
       const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
+          x + cw > width || y + ch > height) {
         return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
       }
-      w = cw;
-      h = ch;
+      width = cw;
+      height = ch;
     }
+
     if (options->use_scaling) {
+#if !defined(WEBP_REDUCE_SIZE)
       int scaled_width = options->scaled_width;
       int scaled_height = options->scaled_height;
       if (!WebPRescalerGetScaledDimensions(
-              w, h, &scaled_width, &scaled_height)) {
+              width, height, &scaled_width, &scaled_height)) {
         return VP8_STATUS_INVALID_PARAM;
       }
-      w = scaled_width;
-      h = scaled_height;
+      width = scaled_width;
+      height = scaled_height;
+#else
+      return VP8_STATUS_INVALID_PARAM;   // rescaling not supported
+#endif
     }
   }
-  out->width = w;
-  out->height = h;
+  buffer->width = width;
+  buffer->height = height;
 
   // Then, allocate buffer for real.
-  status = AllocateBuffer(out);
+  status = AllocateBuffer(buffer);
   if (status != VP8_STATUS_OK) return status;
 
   // Use the stride trick if vertical flip is needed.
   if (options != NULL && options->flip) {
-    status = WebPFlipBuffer(out);
+    status = WebPFlipBuffer(buffer);
   }
   return status;
 }
diff --git a/thirdparty/libwebp/dec/common_dec.h b/thirdparty/libwebp/src/dec/common_dec.h
index 6961e22470..9995f1a51a 100644
--- a/thirdparty/libwebp/dec/common_dec.h
+++ b/thirdparty/libwebp/src/dec/common_dec.h
@@ -11,8 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_DEC_COMMON_H_
-#define WEBP_DEC_COMMON_H_
+#ifndef WEBP_DEC_COMMON_DEC_H_
+#define WEBP_DEC_COMMON_DEC_H_
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
        NUM_PROBAS = 11
      };
 
-#endif    // WEBP_DEC_COMMON_H_
+#endif    // WEBP_DEC_COMMON_DEC_H_
diff --git a/thirdparty/libwebp/dec/frame_dec.c b/thirdparty/libwebp/src/dec/frame_dec.c
index f91e27f7c8..517d0f5850 100644
--- a/thirdparty/libwebp/dec/frame_dec.c
+++ b/thirdparty/libwebp/src/dec/frame_dec.c
@@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Main reconstruction function.
 
-static const int kScan[16] = {
+static const uint16_t kScan[16] = {
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
@@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 #define MIN_DITHER_AMP 4
 
 #define DITHER_AMP_TAB_SIZE 12
-static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
   // roughly, it's dqm->uv_mat_[1]
   8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
 };
@@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   }
 
   mem = (uint8_t*)dec->mem_;
-  dec->intra_t_ = (uint8_t*)mem;
+  dec->intra_t_ = mem;
   mem += intra_pred_mode_size;
 
   dec->yuv_t_ = (VP8TopSamples*)mem;
@@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
 
   mem = (uint8_t*)WEBP_ALIGN(mem);
   assert((yuv_size & WEBP_ALIGN_CST) == 0);
-  dec->yuv_b_ = (uint8_t*)mem;
+  dec->yuv_b_ = mem;
   mem += yuv_size;
 
   dec->mb_data_ = (VP8MBData*)mem;
@@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
     const int extra_rows = kFilterExtraRows[dec->filter_type_];
     const int extra_y = extra_rows * dec->cache_y_stride_;
     const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
-    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_y_ = mem + extra_y;
     dec->cache_u_ = dec->cache_y_
                   + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
     dec->cache_v_ = dec->cache_u_
@@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += cache_size;
 
   // alpha plane
-  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  dec->alpha_plane_ = alpha_size ? mem : NULL;
   mem += alpha_size;
   assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
 
diff --git a/thirdparty/libwebp/dec/idec_dec.c b/thirdparty/libwebp/src/dec/idec_dec.c
index 78fb2e7186..a371ed7500 100644
--- a/thirdparty/libwebp/dec/idec_dec.c
+++ b/thirdparty/libwebp/src/dec/idec_dec.c
@@ -15,10 +15,10 @@
 #include <string.h>
 #include <stdlib.h>
 
-#include "./alphai_dec.h"
-#include "./webpi_dec.h"
-#include "./vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@@ -673,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Wrapper toward WebPINewDecoder
 
-WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
+WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
                           size_t output_buffer_size, int output_stride) {
   const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
   WebPIDecoder* idec;
 
-  if (mode >= MODE_YUV) return NULL;
+  if (csp >= MODE_YUV) return NULL;
   if (is_external_memory == 0) {    // Overwrite parameters to sane values.
     output_buffer_size = 0;
     output_stride = 0;
@@ -689,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
   }
   idec = WebPINewDecoder(NULL);
   if (idec == NULL) return NULL;
-  idec->output_.colorspace = mode;
+  idec->output_.colorspace = csp;
   idec->output_.is_external_memory = is_external_memory;
   idec->output_.u.RGBA.rgba = output_buffer;
   idec->output_.u.RGBA.stride = output_stride;
diff --git a/thirdparty/libwebp/dec/io_dec.c b/thirdparty/libwebp/src/dec/io_dec.c
index 8bfab86959..e603f19c98 100644
--- a/thirdparty/libwebp/dec/io_dec.c
+++ b/thirdparty/libwebp/src/dec/io_dec.c
@@ -13,11 +13,11 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "../dec/vp8i_dec.h"
-#include "./webpi_dec.h"
-#include "../dsp/dsp.h"
-#include "../dsp/yuv.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/yuv.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
@@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
     int num_rows;
     const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     uint8_t* alpha_dst = base_rgba;
 #else
     uint8_t* alpha_dst = base_rgba + 1;
@@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
 //------------------------------------------------------------------------------
 // YUV rescaling (no final RGB conversion needed)
 
+#if !defined(WEBP_REDUCE_SIZE)
 static int Rescale(const uint8_t* src, int src_stride,
                    int new_lines, WebPRescaler* const wrk) {
   int num_lines_out = 0;
@@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                                int max_lines_out) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
   uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   uint8_t* alpha_dst = base_rgba;
 #else
   uint8_t* alpha_dst = base_rgba + 1;
@@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   return 1;
 }
 
+#endif  // WEBP_REDUCE_SIZE
+
 //------------------------------------------------------------------------------
 // Default custom functions
 
@@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) {
     WebPInitUpsamplers();
   }
   if (io->use_scaling) {
+#if !defined(WEBP_REDUCE_SIZE)
     const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
     if (!ok) {
       return 0;    // memory error
     }
+#else
+    return 0;   // rescaling support not compiled
+#endif
   } else {
     if (is_rgb) {
       WebPInitSamplers();
@@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) {
     }
   }
 
-  if (is_rgb) {
-    VP8YUVInit();
-  }
   return 1;
 }
 
diff --git a/thirdparty/libwebp/dec/quant_dec.c b/thirdparty/libwebp/src/dec/quant_dec.c
index 14e3198946..f07212ad73 100644
--- a/thirdparty/libwebp/dec/quant_dec.c
+++ b/thirdparty/libwebp/src/dec/quant_dec.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i_dec.h"
+#include "src/dec/vp8i_dec.h"
 
 static WEBP_INLINE int clip(int v, int M) {
   return v < 0 ? 0 : v > M ? M : v;
diff --git a/thirdparty/libwebp/dec/tree_dec.c b/thirdparty/libwebp/src/dec/tree_dec.c
index 9e805f60f3..3f5a957d32 100644
--- a/thirdparty/libwebp/dec/tree_dec.c
+++ b/thirdparty/libwebp/src/dec/tree_dec.c
@@ -11,15 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i_dec.h"
-#include "../utils/bit_reader_inl_utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/bit_reader_inl_utils.h"
 
+#if !defined(USE_GENERIC_TREE)
 #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
-#define USE_GENERIC_TREE
+#define USE_GENERIC_TREE 1   // ALTERNATE_CODE
+#else
+#define USE_GENERIC_TREE 0
 #endif
+#endif  // USE_GENERIC_TREE
 
-#ifdef USE_GENERIC_TREE
+#if (USE_GENERIC_TREE == 1)
 static const int8_t kYModesIntra4[18] = {
   -B_DC_PRED, 1,
     -B_TM_PRED, 2,
@@ -317,7 +321,7 @@ static void ParseIntraMode(VP8BitReader* const br,
       int x;
       for (x = 0; x < 4; ++x) {
         const uint8_t* const prob = kBModesProba[top[x]][ymode];
-#ifdef USE_GENERIC_TREE
+#if (USE_GENERIC_TREE == 1)
         // Generic tree-parsing
         int i = kYModesIntra4[VP8GetBit(br, prob[0])];
         while (i > 0) {
@@ -335,7 +339,7 @@ static void ParseIntraMode(VP8BitReader* const br,
                         (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
                           (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
                             (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
-#endif    // USE_GENERIC_TREE
+#endif  // USE_GENERIC_TREE
         top[x] = ymode;
       }
       memcpy(modes, top, 4 * sizeof(*top));
@@ -498,7 +502,7 @@ static const uint8_t
 
 // Paragraph 9.9
 
-static const int kBands[16 + 1] = {
+static const uint8_t kBands[16 + 1] = {
   0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
   0  // extra entry as sentinel
 };
diff --git a/thirdparty/libwebp/dec/vp8_dec.c b/thirdparty/libwebp/src/dec/vp8_dec.c
index fad8d9cf35..6212efd179 100644
--- a/thirdparty/libwebp/dec/vp8_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8_dec.c
@@ -13,12 +13,12 @@
 
 #include <stdlib.h>
 
-#include "./alphai_dec.h"
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/bit_reader_inl_utils.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/bit_reader_inl_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/dec/vp8_dec.h b/thirdparty/libwebp/src/dec/vp8_dec.h
index b9337bbec0..ca85b340cf 100644
--- a/thirdparty/libwebp/dec/vp8_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8_dec.h
@@ -11,10 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_WEBP_DECODE_VP8_H_
-#define WEBP_WEBP_DECODE_VP8_H_
+#ifndef WEBP_DEC_VP8_DEC_H_
+#define WEBP_DEC_VP8_DEC_H_
 
-#include "../webp/decode.h"
+#include "src/webp/decode.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,7 +33,7 @@ extern "C" {
 //   /* customize io's functions (setup()/put()/teardown()) if needed. */
 //
 //   VP8Decoder* dec = VP8New();
-//   bool ok = VP8Decode(dec);
+//   int ok = VP8Decode(dec, &io);
 //   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
 //   VP8Delete(dec);
 //   return ok;
@@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
 // Miscellaneous VP8/VP8L bitstream probing functions.
 
 // Returns true if the next 3 bytes in data contain the VP8 signature.
-WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
 
 // Validates the VP8 data-header and retrieves basic header information viz
 // width and height. Returns 0 in case of formatting error. *width/*height
 // can be passed NULL.
-WEBP_EXTERN(int) VP8GetInfo(
+WEBP_EXTERN int VP8GetInfo(
     const uint8_t* data,
     size_t data_size,    // data available so far
     size_t chunk_size,   // total data size expected in the chunk
     int* const width, int* const height);
 
 // Returns true if the next byte(s) in data is a VP8L signature.
-WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
 
 // Validates the VP8L data-header and retrieves basic header information viz
 // width, height and alpha. Returns 0 in case of formatting error.
 // width/height/has_alpha can be passed NULL.
-WEBP_EXTERN(int) VP8LGetInfo(
+WEBP_EXTERN int VP8LGetInfo(
     const uint8_t* data, size_t data_size,  // data available so far
     int* const width, int* const height, int* const has_alpha);
 
@@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo(
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
+#endif  /* WEBP_DEC_VP8_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/vp8i_dec.h b/thirdparty/libwebp/src/dec/vp8i_dec.h
index 555853e8f8..28244d9d7a 100644
--- a/thirdparty/libwebp/dec/vp8i_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8i_dec.h
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_DEC_VP8I_H_
-#define WEBP_DEC_VP8I_H_
+#ifndef WEBP_DEC_VP8I_DEC_H_
+#define WEBP_DEC_VP8I_DEC_H_
 
 #include <string.h>     // for memcpy()
-#include "./common_dec.h"
-#include "./vp8li_dec.h"
-#include "../utils/bit_reader_utils.h"
-#include "../utils/random_utils.h"
-#include "../utils/thread_utils.h"
-#include "../dsp/dsp.h"
+#include "src/dec/common_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/random_utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/dsp/dsp.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 6
-#define DEC_REV_VERSION 0
+#define DEC_REV_VERSION 1
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
@@ -57,7 +57,6 @@ extern "C" {
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
 #define YUV_SIZE (BPS * 17 + BPS * 9)
-#define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
@@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_VP8I_H_ */
+#endif  /* WEBP_DEC_VP8I_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/vp8l_dec.c b/thirdparty/libwebp/src/dec/vp8l_dec.c
index ef359a91f0..42ea3b5e4c 100644
--- a/thirdparty/libwebp/dec/vp8l_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8l_dec.c
@@ -14,22 +14,22 @@
 
 #include <stdlib.h>
 
-#include "./alphai_dec.h"
-#include "./vp8li_dec.h"
-#include "../dsp/dsp.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../dsp/yuv.h"
-#include "../utils/endian_inl_utils.h"
-#include "../utils/huffman_utils.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/huffman_utils.h"
+#include "src/utils/utils.h"
 
 #define NUM_ARGB_CACHE_ROWS          16
 
 static const int kCodeLengthLiterals = 16;
 static const int kCodeLengthRepeatCode = 16;
-static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
-static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
+static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
+static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
 
 // -----------------------------------------------------------------------------
 //  Five Huffman codes are used at each meta code:
@@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
 // http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
-static const int kTableSize[12] = {
+static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
   FIXED_TABLE_SIZE + 656,
   FIXED_TABLE_SIZE + 658,
@@ -485,6 +485,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
 //------------------------------------------------------------------------------
 // Scaling.
 
+#if !defined(WEBP_REDUCE_SIZE)
 static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   const int num_channels = 4;
   const int in_width = io->mb_w;
@@ -516,10 +517,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
                    out_width, out_height, 0, num_channels, work);
   return 1;
 }
+#endif   // WEBP_REDUCE_SIZE
 
 //------------------------------------------------------------------------------
 // Export to ARGB
 
+#if !defined(WEBP_REDUCE_SIZE)
+
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
@@ -561,6 +565,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
   return num_lines_out;
 }
 
+#endif   // WEBP_REDUCE_SIZE
+
 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
                     const uint8_t* row_in, int in_stride,
@@ -746,9 +752,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
       if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
         uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
-        const int num_rows_out = io->use_scaling ?
+        const int num_rows_out =
+#if !defined(WEBP_REDUCE_SIZE)
+         io->use_scaling ?
             EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
                                  rgba, buf->stride) :
+#endif  // WEBP_REDUCE_SIZE
             EmitRows(output->colorspace, rows_data, in_stride,
                      io->mb_w, io->mb_h, rgba, buf->stride);
         // Update 'last_out_row_'.
@@ -1012,12 +1021,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
       ok = 0;
       goto End;
     }
-    assert(br->eos_ == VP8LIsEndOfStream(br));
+    br->eos_ = VP8LIsEndOfStream(br);
   }
   // Process the remaining rows corresponding to last row-block.
   ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);
 
  End:
+  br->eos_ = VP8LIsEndOfStream(br);
   if (!ok || (br->eos_ && pos < end)) {
     ok = 0;
     dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
@@ -1090,11 +1100,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
     VP8LFillBitWindow(br);
     if (htree_group->use_packed_table) {
       code = ReadPackedSymbols(htree_group, br, src);
+      if (VP8LIsEndOfStream(br)) break;
       if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
     } else {
       code = ReadSymbol(htree_group->htrees[GREEN], br);
     }
-    if (br->eos_) break;  // early out
+    if (VP8LIsEndOfStream(br)) break;
     if (code < NUM_LITERAL_CODES) {  // Literal
       if (htree_group->is_trivial_literal) {
         *src = htree_group->literal_arb | (code << 8);
@@ -1104,7 +1115,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
         VP8LFillBitWindow(br);
         blue = ReadSymbol(htree_group->htrees[BLUE], br);
         alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
-        if (br->eos_) break;
+        if (VP8LIsEndOfStream(br)) break;
         *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
       }
     AdvanceByOne:
@@ -1132,7 +1143,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       VP8LFillBitWindow(br);
       dist_code = GetCopyDistance(dist_symbol, br);
       dist = PlaneCodeToDistance(width, dist_code);
-      if (br->eos_) break;
+      if (VP8LIsEndOfStream(br)) break;
       if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
         goto Error;
       } else {
@@ -1169,9 +1180,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
     } else {  // Not reached
       goto Error;
     }
-    assert(br->eos_ == VP8LIsEndOfStream(br));
   }
 
+  br->eos_ = VP8LIsEndOfStream(br);
   if (dec->incremental_ && br->eos_ && src < src_end) {
     RestoreState(dec);
   } else if (!br->eos_) {
@@ -1630,12 +1641,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
 
     if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
 
+#if !defined(WEBP_REDUCE_SIZE)
     if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
 
     if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
       // need the alpha-multiply functions for premultiplied output or rescaling
       WebPInitAlphaProcessing();
     }
+#else
+    if (io->use_scaling) {
+      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      goto Err;
+    }
+#endif
     if (!WebPIsRGBMode(dec->output_->colorspace)) {
       WebPInitConvertARGBToYUV();
       if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
diff --git a/thirdparty/libwebp/dec/vp8li_dec.h b/thirdparty/libwebp/src/dec/vp8li_dec.h
index 097a9d0589..8e500cf9ff 100644
--- a/thirdparty/libwebp/dec/vp8li_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8li_dec.h
@@ -12,14 +12,14 @@
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora(vikaas.arora@gmail.com)
 
-#ifndef WEBP_DEC_VP8LI_H_
-#define WEBP_DEC_VP8LI_H_
+#ifndef WEBP_DEC_VP8LI_DEC_H_
+#define WEBP_DEC_VP8LI_DEC_H_
 
 #include <string.h>     // for memcpy()
-#include "./webpi_dec.h"
-#include "../utils/bit_reader_utils.h"
-#include "../utils/color_cache_utils.h"
-#include "../utils/huffman_utils.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/huffman_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_VP8LI_H_ */
+#endif  /* WEBP_DEC_VP8LI_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/webp_dec.c b/thirdparty/libwebp/src/dec/webp_dec.c
index a8e9c2c510..42d098874d 100644
--- a/thirdparty/libwebp/dec/webp_dec.c
+++ b/thirdparty/libwebp/src/dec/webp_dec.c
@@ -13,11 +13,11 @@
 
 #include <stdlib.h>
 
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/utils.h"
-#include "../webp/mux_types.h"  // ALPHA_FLAG
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/utils.h"
+#include "src/webp/mux_types.h"  // ALPHA_FLAG
 
 //------------------------------------------------------------------------------
 // RIFF layout is:
@@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
                                 NULL, NULL, NULL, &has_animation,
                                 NULL, headers);
   if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    // TODO(jzern): full support of animation frames will require API additions.
+    // The WebPDemux API + libwebp can be used to decode individual
+    // uncomposited frames or the WebPAnimDecoder can be used to fully
+    // reconstruct them (see webp/demux.h).
     if (has_animation) {
       status = VP8_STATUS_UNSUPPORTED_FEATURE;
     }
diff --git a/thirdparty/libwebp/dec/webpi_dec.h b/thirdparty/libwebp/src/dec/webpi_dec.h
index 696abc1958..c378ba6fc3 100644
--- a/thirdparty/libwebp/dec/webpi_dec.h
+++ b/thirdparty/libwebp/src/dec/webpi_dec.h
@@ -11,15 +11,15 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#ifndef WEBP_DEC_WEBPI_H_
-#define WEBP_DEC_WEBPI_H_
+#ifndef WEBP_DEC_WEBPI_DEC_H_
+#define WEBP_DEC_WEBPI_DEC_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "../utils/rescaler_utils.h"
-#include "./vp8_dec.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/dec/vp8_dec.h"
 
 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_WEBPI_H_ */
+#endif  /* WEBP_DEC_WEBPI_DEC_H_ */
diff --git a/thirdparty/libwebp/demux/anim_decode.c b/thirdparty/libwebp/src/demux/anim_decode.c
index f1cf176e72..05dd707371 100644
--- a/thirdparty/libwebp/demux/anim_decode.c
+++ b/thirdparty/libwebp/src/demux/anim_decode.c
@@ -11,15 +11,15 @@
 //
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <assert.h>
 #include <string.h>
 
-#include "../utils/utils.h"
-#include "../webp/decode.h"
-#include "../webp/demux.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/demux.h"
 
 #define NUM_CHANNELS 4
 
diff --git a/thirdparty/libwebp/demux/demux.c b/thirdparty/libwebp/src/demux/demux.c
index 100eab8c01..79c24a5a7f 100644
--- a/thirdparty/libwebp/demux/demux.c
+++ b/thirdparty/libwebp/src/demux/demux.c
@@ -11,21 +11,21 @@
 //
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "../utils/utils.h"
-#include "../webp/decode.h"     // WebPGetFeatures
-#include "../webp/demux.h"
-#include "../webp/format_constants.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"     // WebPGetFeatures
+#include "src/webp/demux.h"
+#include "src/webp/format_constants.h"
 
 #define DMUX_MAJ_VERSION 0
 #define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 2
+#define DMUX_REV_VERSION 3
 
 typedef struct {
   size_t start_;        // start location of the data
@@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size,
   frame->complete_ = complete;
 }
 
-// Store image bearing chunks to 'frame'.
+// Store image bearing chunks to 'frame'. 'min_size' is an optional size
+// requirement, it may be zero.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
                               MemBuffer* const mem, Frame* const frame) {
   int alpha_chunks = 0;
   int image_chunks = 0;
-  int done = (MemDataSize(mem) < min_size);
+  int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
+              MemDataSize(mem) < min_size);
   ParseStatus status = PARSE_OK;
 
   if (done) return PARSE_NEED_MORE_DATA;
@@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
   frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
   if (frame == NULL) return PARSE_ERROR;
 
-  // For the single image case we allow parsing of a partial frame, but we need
-  // at least CHUNK_HEADER_SIZE for parsing.
-  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
+  // For the single image case we allow parsing of a partial frame, so no
+  // minimum size is imposed here.
+  status = StoreFrame(1, 0, &dmux->mem_, frame);
   if (status != PARSE_ERROR) {
     const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
     // Clear any alpha when the alpha flag is missing.
diff --git a/thirdparty/libwebp/dsp/alpha_processing.c b/thirdparty/libwebp/src/dsp/alpha_processing.c
index 4b60e092be..590e3bc312 100644
--- a/thirdparty/libwebp/dsp/alpha_processing.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing.c
@@ -12,10 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 // Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
+#if !defined(USE_TABLES_FOR_ALPHA_MULT)
+#define USE_TABLES_FOR_ALPHA_MULT 0   // ALTERNATE_CODE
+#endif
+
 
 // -----------------------------------------------------------------------------
 
@@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
   return v;
 }
 
-#ifdef USE_TABLES_FOR_ALPHA_MULT
+#if (USE_TABLES_FOR_ALPHA_MULT == 1)
 
 static const uint32_t kMultTables[2][256] = {
   {    // (255u << MFIX) / alpha
@@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
   return inverse ? (255u << MFIX) / a : a * KINV_255;
 }
 
-#endif    // USE_TABLES_FOR_ALPHA_MULT
+#endif  // USE_TABLES_FOR_ALPHA_MULT
 
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
     const uint32_t argb = ptr[x];
@@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse) {
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
     const uint32_t a = alpha[x];
@@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif
 
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
-                               int w, int h, int stride) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
+                                 int w, int h, int stride) {
   while (h-- > 0) {
     uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
     const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
     rgba += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 #undef MULTIPLIER
 #undef PREMULTIPLY
 
@@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
   return (x * m) >> 16;
 }
 
-static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
-                                               int w, int h, int stride,
-                                               int rg_byte_pos /* 0 or 1 */) {
+static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
+                                                 int w, int h, int stride,
+                                                 int rg_byte_pos /* 0 or 1 */) {
   while (h-- > 0) {
     int i;
     for (i = 0; i < w; ++i) {
@@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
 }
 #undef MULTIPLIER
 
-static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
-                                   int w, int h, int stride) {
-#ifdef WEBP_SWAP_16BIT_CSP
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
+                                     int w, int h, int stride) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
 #else
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
 #endif
 }
 
+#if !WEBP_NEON_OMIT_C_CODE
 static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
                            int width, int height,
                            uint8_t* dst, int dst_stride) {
@@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
   int i;
   for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+
+static int HasAlpha8b_C(const uint8_t* src, int length) {
+  while (length-- > 0) if (*src++ != 0xff) return 1;
+  return 0;
+}
+
+static int HasAlpha32b_C(const uint8_t* src, int length) {
+  int x;
+  for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Simple channel manipulations.
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                      int len, int step, uint32_t* out) {
+  int i, offset = 0;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+    offset += step;
+  }
+}
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out);
+
+int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+int (*WebPHasAlpha32b)(const uint8_t* src, int length);
 
 //------------------------------------------------------------------------------
 // Init function
@@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
   if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  WebPMultARGBRow = WebPMultARGBRowC;
-  WebPMultRow = WebPMultRowC;
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
-  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPMultARGBRow = WebPMultARGBRow_C;
+  WebPMultRow = WebPMultRow_C;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
 
+  WebPPackRGB = PackRGB_C;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
   WebPDispatchAlpha = DispatchAlpha_C;
   WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
   WebPExtractAlpha = ExtractAlpha_C;
   WebPExtractGreen = ExtractGreen_C;
+#endif
+
+  WebPHasAlpha8b = HasAlpha8b_C;
+  WebPHasAlpha32b = HasAlpha32b_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitAlphaProcessingNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitAlphaProcessingMIPSdspR2();
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitAlphaProcessingNEON();
+  }
+#endif
+
+  assert(WebPMultARGBRow != NULL);
+  assert(WebPMultRow != NULL);
+  assert(WebPApplyAlphaMultiply != NULL);
+  assert(WebPApplyAlphaMultiply4444 != NULL);
+  assert(WebPDispatchAlpha != NULL);
+  assert(WebPDispatchAlphaToGreen != NULL);
+  assert(WebPExtractAlpha != NULL);
+  assert(WebPExtractGreen != NULL);
+  assert(WebPPackRGB != NULL);
+  assert(WebPHasAlpha8b != NULL);
+  assert(WebPHasAlpha32b != NULL);
+
   alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/alpha_processing_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
index c631d78905..e0dc91bab9 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@@ -12,13 +12,13 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut  (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+                                   int width, int height,
+                                   uint8_t* dst, int dst_stride) {
   uint32_t alpha_mask = 0xffffffff;
   int i, j, temp0;
 
@@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xff);
 }
 
-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+                                  int inverse) {
   int x;
   const uint32_t c_00ffffff = 0x00ffffffu;
   const uint32_t c_ff000000 = 0xff000000u;
@@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+                              const uint8_t* b, int len, int step,
+                              uint32_t* out) {
+  int temp0, temp1, temp2, offset;
+  const int rest = len & 1;
+  const int a = 0xff;
+  const uint32_t* const loop_end = out + len - rest;
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+
 //------------------------------------------------------------------------------
 // Entry point
 
 extern void WebPInitAlphaProcessingMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPMultARGBRow = MultARGBRow;
+  WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+  WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+  WebPPackRGB = PackRGB_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/alpha_processing_neon.c b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c
index 606a401cf7..9d55421704 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_neon.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
-#include "./neon.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/dsp/alpha_processing_sse2.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 83dc559fac..76587006a1 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_sse2.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
+                              int width, int height,
+                              uint8_t* dst, int dst_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   return (alpha_and != 0xff);
 }
 
-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
-                                 int width, int height,
-                                 uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
+                                      int width, int height,
+                                      uint32_t* dst, int dst_stride) {
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const int limit = width & ~15;
@@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+                             int width, int height,
+                             uint8_t* alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
 #undef MULTIPLIER
 #undef PREMULTIPLY
 
+//------------------------------------------------------------------------------
+// Alpha detection
+
+static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
+  const __m128i all_0xff = _mm_set1_epi8(0xff);
+  int i = 0;
+  for (; i + 16 <= length; i += 16) {
+    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  for (; i < length; ++i) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
+static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
+  const __m128i alpha_mask = _mm_set1_epi32(0xff);
+  const __m128i all_0xff = _mm_set1_epi8(0xff);
+  int i = 0;
+  // We don't know if we can access the last 3 bytes after the last alpha
+  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
+  // or the last byte of the quadruplet). Hence the '-3' protection below.
+  length = length * 4 - 3;   // size in bytes
+  for (; i + 64 <= length; i += 64) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
+    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
+    const __m128i c0 = _mm_packs_epi32(b0, b1);
+    const __m128i c1 = _mm_packs_epi32(b2, b3);
+    const __m128i d  = _mm_packus_epi16(c0, c1);
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  for (; i + 32 <= length; i += 32) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i c  = _mm_packs_epi32(b0, b1);
+    const __m128i d  = _mm_packus_epi16(c, c);
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;
+  }
+  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows
 
@@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
     }
   }
   width -= x;
-  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }
 
 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
     }
   }
   width -= x;
-  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
 }
 
 //------------------------------------------------------------------------------
@@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
   WebPMultARGBRow = MultARGBRow_SSE2;
   WebPMultRow = MultRow_SSE2;
   WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
-  WebPExtractAlpha = ExtractAlpha;
+  WebPDispatchAlpha = DispatchAlpha_SSE2;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
+  WebPExtractAlpha = ExtractAlpha_SSE2;
+
+  WebPHasAlpha8b = HasAlpha8b_SSE2;
+  WebPHasAlpha32b = HasAlpha32b_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/alpha_processing_sse41.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c
index 986fde94ed..56040f9c88 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_sse41.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 
@@ -19,9 +19,9 @@
 
 //------------------------------------------------------------------------------
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
+                              int width, int height,
+                              uint8_t* alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
 extern void WebPInitAlphaProcessingSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
-  WebPExtractAlpha = ExtractAlpha;
+  WebPExtractAlpha = ExtractAlpha_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/common_sse2.h b/thirdparty/libwebp/src/dsp/common_sse2.h
index 995d7cf4ea..995d7cf4ea 100644
--- a/thirdparty/libwebp/dsp/common_sse2.h
+++ b/thirdparty/libwebp/src/dsp/common_sse2.h
diff --git a/thirdparty/libwebp/dsp/cost.c b/thirdparty/libwebp/src/dsp/cost.c
index 58ddea7248..a732389d58 100644
--- a/thirdparty/libwebp/dsp/cost.c
+++ b/thirdparty/libwebp/src/dsp/cost.c
@@ -9,8 +9,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
@@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
 //------------------------------------------------------------------------------
 // Mode costs
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
   int n = res->first;
   // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
   const int p0 = res->prob[n][ctx0][0];
@@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   return cost;
 }
 
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+                                VP8Residual* const res) {
   int n;
   res->last = -1;
   assert(res->first == 0 || coeffs[0] == 0);
@@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
   if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  VP8GetResidualCost = GetResidualCost;
-  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost_C;
+  VP8SetResidualCoeffs = SetResidualCoeffs_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
diff --git a/thirdparty/libwebp/dsp/cost_mips32.c b/thirdparty/libwebp/src/dsp/cost_mips32.c
index 3102da877a..0500f88c13 100644
--- a/thirdparty/libwebp/dsp/cost_mips32.c
+++ b/thirdparty/libwebp/src/dsp/cost_mips32.c
@@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
   int temp0, temp1;
   int v_reg, ctx_reg;
   int n = res->first;
@@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   return cost;
 }
 
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+                                     VP8Residual* const res) {
   const int16_t* p_coeffs = (int16_t*)coeffs;
   int temp0, temp1, temp2, n, n1;
   assert(res->first == 0 || coeffs[0] == 0);
@@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 extern void VP8EncDspCostInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
-  VP8GetResidualCost = GetResidualCost;
-  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost_MIPS32;
+  VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/cost_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
index 6ec8aeb610..51248de7a1 100644
--- a/thirdparty/libwebp/dsp/cost_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
@@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
   int temp0, temp1;
   int v_reg, ctx_reg;
   int n = res->first;
@@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
-  VP8GetResidualCost = GetResidualCost;
+  VP8GetResidualCost = GetResidualCost_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/cost_sse2.c b/thirdparty/libwebp/src/dsp/cost_sse2.c
index 421d51fdd5..487a079921 100644
--- a/thirdparty/libwebp/dsp/cost_sse2.c
+++ b/thirdparty/libwebp/src/dsp/cost_sse2.c
@@ -11,19 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
-static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
-                                  VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+                                   VP8Residual* const res) {
   const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
   const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
   // Use SSE2 to compare 16 values with a single instruction.
@@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
   res->coeffs = coeffs;
 }
 
-static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
   uint8_t levels[16], ctxs[16];
   uint16_t abs_levels[16];
   int n = res->first;
@@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
-  VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
-  VP8GetResidualCost = GetResidualCostSSE2;
+  VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+  VP8GetResidualCost = GetResidualCost_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c
index b5583b6e9b..8b40feed29 100644
--- a/thirdparty/libwebp/dsp/cpu.c
+++ b/thirdparty/libwebp/src/dsp/cpu.c
@@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
@@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
     return !!(cpu_info[2] & (1 << 0));
   }
   if (feature == kSlowSSSE3) {
-    if (is_intel && (cpu_info[2] & (1 << 0))) {   // SSSE3?
+    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3?
       return CheckSlowModel(cpu_info[0]);
     }
     return 0;
diff --git a/thirdparty/libwebp/dsp/dec.c b/thirdparty/libwebp/src/dsp/dec.c
index 007e985d8b..7e82407567 100644
--- a/thirdparty/libwebp/dsp/dec.c
+++ b/thirdparty/libwebp/src/dsp/dec.c
@@ -11,9 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include <assert.h>
+
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 // Transforms (Paragraph 14.4)
 
 #define STORE(x, y, v) \
-  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
 
 #define STORE2(y, dc, d, c) do {    \
   const int DC = (dc);              \
@@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
 #define MUL2(a) (((a) * 35468) >> 16)
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformOne_C(const int16_t* in, uint8_t* dst) {
   int C[4 * 4], *tmp;
   int i;
   tmp = C;
@@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 }
 
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
   const int a = in[0] + 4;
   const int c4 = MUL2(in[4]);
   const int d4 = MUL1(in[4]);
@@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 #undef MUL2
 #undef STORE2
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne_C(in, dst);
   if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOne_C(in + 16, dst + 4);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void TransformUV(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* in, uint8_t* dst) {
   VP8Transform(in + 0 * 16, dst, 1);
   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformDC_C(const int16_t* in, uint8_t* dst) {
   const int DC = in[0] + 4;
   int i, j;
   for (j = 0; j < 4; ++j) {
@@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
     }
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // Paragraph 14.3
 
-static void TransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformWHT_C(const int16_t* in, int16_t* out) {
   int tmp[16];
   int i;
   for (i = 0; i < 4; ++i) {
@@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     out += 64;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
@@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
 #define DST(x, y) dst[(x) + (y) * BPS]
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   const uint8_t* top = dst - BPS;
   const uint8_t* const clip0 = VP8kclip1 - top[-1];
@@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
     dst += BPS;
   }
 }
-static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4_C(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16_C(uint8_t* dst)  { TrueMotion(dst, 16); }
 
 //------------------------------------------------------------------------------
 // 16x16
 
-static void VE16(uint8_t* dst) {     // vertical
+static void VE16_C(uint8_t* dst) {     // vertical
   int j;
   for (j = 0; j < 16; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 16);
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_C(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
     memset(dst, dst[-1], 16);
@@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
   }
 }
 
-static void DC16(uint8_t* dst) {    // DC
+static void DC16_C(uint8_t* dst) {    // DC
   int DC = 16;
   int j;
   for (j = 0; j < 16; ++j) {
@@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) {    // DC
   Put16(DC >> 5, dst);
 }
 
-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_C(uint8_t* dst) {   // DC with top samples not available
   int DC = 8;
   int j;
   for (j = 0; j < 16; ++j) {
@@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
   Put16(DC >> 4, dst);
 }
 
-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_C(uint8_t* dst) {  // DC with left samples not available
   int DC = 8;
   int i;
   for (i = 0; i < 16; ++i) {
@@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
   Put16(DC >> 4, dst);
 }
 
-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+static void DC16NoTopLeft_C(uint8_t* dst) {  // DC with no top and left samples
   Put16(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 
@@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
-static void VE4(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE4_C(uint8_t* dst) {    // vertical
   const uint8_t* top = dst - BPS;
   const uint8_t vals[4] = {
     AVG3(top[-1], top[0], top[1]),
@@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) {    // vertical
     memcpy(dst + i * BPS, vals, sizeof(vals));
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HE4(uint8_t* dst) {    // horizontal
+static void HE4_C(uint8_t* dst) {    // horizontal
   const int A = dst[-1 - BPS];
   const int B = dst[-1];
   const int C = dst[-1 + BPS];
@@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) {    // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }
 
-static void DC4(uint8_t* dst) {   // DC
+#if !WEBP_NEON_OMIT_C_CODE
+static void DC4_C(uint8_t* dst) {   // DC
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) {   // DC
   for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_C(uint8_t* dst) {   // Down-right
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) {   // Down-right
                                       DST(3, 0) = AVG3(D, C, B);
 }
 
-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_C(uint8_t* dst) {   // Down-Left
   const int A = dst[0 - BPS];
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
@@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) {   // Down-Left
                           DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
                                       DST(3, 3) = AVG3(G, H, H);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_C(uint8_t* dst) {   // Vertical-Right
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
   DST(3, 1) =             AVG3(B, C, D);
 }
 
-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_C(uint8_t* dst) {   // Vertical-Left
   const int A = dst[0 - BPS];
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
@@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
               DST(3, 3) = AVG3(F, G, H);
 }
 
-static void HU4(uint8_t* dst) {   // Horizontal-Up
+static void HU4_C(uint8_t* dst) {   // Horizontal-Up
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) {   // Horizontal-Up
     DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
-static void HD4(uint8_t* dst) {  // Horizontal-Down
+static void HD4_C(uint8_t* dst) {  // Horizontal-Down
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE8uv_C(uint8_t* dst) {    // vertical
   int j;
   for (j = 0; j < 8; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 8);
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
+static void HE8uv_C(uint8_t* dst) {    // horizontal
   int j;
   for (j = 0; j < 8; ++j) {
     memset(dst, dst[-1], 8);
@@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   }
 }
 
-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_C(uint8_t* dst) {     // DC
   int dc0 = 8;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) {     // DC
   Put8x8uv(dc0 >> 4, dst);
 }
 
-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_C(uint8_t* dst) {   // DC with no left samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
   Put8x8uv(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_C(uint8_t* dst) {  // DC with no top samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
   Put8x8uv(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+static void DC8uvNoTopLeft_C(uint8_t* dst) {    // DC with nothing
   Put8x8uv(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
 
 //------------------------------------------------------------------------------
 // Edge filtering functions
 
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 // 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
   const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
@@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
 }
 
 // 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0);
   const int a1 = VP8ksclip2[(a + 4) >> 3];
@@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
 }
 
 // 6 pixels in, 6 pixels out
-static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
   const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   const int q0 = p[0], q1 = p[step], q2 = p[2*step];
   const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
   p[ 2*step] = VP8kclip1[q2 - a3];
 }
 
-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static WEBP_INLINE int needs_filter2(const uint8_t* p,
-                                     int step, int t, int it) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
+                                      int step, int t, int it) {
   const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
   const int p0 = p[-step], q0 = p[0];
   const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
          VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
          VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
   int i;
   const int thresh2 = 2 * thresh + 1;
   for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh2)) {
-      do_filter2(p + i, stride);
+    if (NeedsFilter_C(p + i, stride, thresh2)) {
+      DoFilter2_C(p + i, stride);
     }
   }
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
   int i;
   const int thresh2 = 2 * thresh + 1;
   for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh2)) {
-      do_filter2(p + i * stride, 1);
+    if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
+      DoFilter2_C(p + i * stride, 1);
     }
   }
 }
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_C(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_C(p, stride, thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static WEBP_INLINE void FilterLoop26(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
   const int thresh2 = 2 * thresh + 1;
   while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
       } else {
-        do_filter6(p, hstride);
+        DoFilter6_C(p, hstride);
       }
     }
     p += vstride;
   }
 }
 
-static WEBP_INLINE void FilterLoop24(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
   const int thresh2 = 2 * thresh + 1;
   while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
       } else {
-        do_filter4(p, hstride);
+        DoFilter4_C(p, hstride);
       }
     }
     p += vstride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
+#if !WEBP_NEON_OMIT_C_CODE
 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+static void VFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+static void HFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
+#if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE
+static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 
-static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
-                             int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
+                               int dst_stride) {
   int i, j;
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i) {
@@ -709,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 
   VP8InitClipTables();
 
-  VP8TransformWHT = TransformWHT;
-  VP8Transform = TransformTwo;
-  VP8TransformUV = TransformUV;
-  VP8TransformDC = TransformDC;
-  VP8TransformDCUV = TransformDCUV;
-  VP8TransformAC3 = TransformAC3;
-
-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[0] = DC4;
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[3] = HE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
-  VP8PredLuma4[8] = HD4;
-  VP8PredLuma4[9] = HU4;
-
-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
-
-  VP8DitherCombine8x8 = DitherCombine8x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8TransformWHT = TransformWHT_C;
+  VP8Transform = TransformTwo_C;
+  VP8TransformDC = TransformDC_C;
+  VP8TransformAC3 = TransformAC3_C;
+#endif
+  VP8TransformUV = TransformUV_C;
+  VP8TransformDCUV = TransformDCUV_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8VFilter16 = VFilter16_C;
+  VP8VFilter16i = VFilter16i_C;
+  VP8HFilter16 = HFilter16_C;
+  VP8VFilter8 = VFilter8_C;
+  VP8VFilter8i = VFilter8i_C;
+  VP8SimpleVFilter16 = SimpleVFilter16_C;
+  VP8SimpleHFilter16 = SimpleHFilter16_C;
+  VP8SimpleVFilter16i = SimpleVFilter16i_C;
+  VP8SimpleHFilter16i = SimpleHFilter16i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8HFilter16i = HFilter16i_C;
+  VP8HFilter8 = HFilter8_C;
+  VP8HFilter8i = HFilter8i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma4[0] = DC4_C;
+  VP8PredLuma4[1] = TM4_C;
+  VP8PredLuma4[2] = VE4_C;
+  VP8PredLuma4[4] = RD4_C;
+  VP8PredLuma4[6] = LD4_C;
+#endif
+
+  VP8PredLuma4[3] = HE4_C;
+  VP8PredLuma4[5] = VR4_C;
+  VP8PredLuma4[7] = VL4_C;
+  VP8PredLuma4[8] = HD4_C;
+  VP8PredLuma4[9] = HU4_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma16[0] = DC16_C;
+  VP8PredLuma16[1] = TM16_C;
+  VP8PredLuma16[2] = VE16_C;
+  VP8PredLuma16[3] = HE16_C;
+  VP8PredLuma16[4] = DC16NoTop_C;
+  VP8PredLuma16[5] = DC16NoLeft_C;
+  VP8PredLuma16[6] = DC16NoTopLeft_C;
+
+  VP8PredChroma8[0] = DC8uv_C;
+  VP8PredChroma8[1] = TM8uv_C;
+  VP8PredChroma8[2] = VE8uv_C;
+  VP8PredChroma8[3] = HE8uv_C;
+  VP8PredChroma8[4] = DC8uvNoTop_C;
+  VP8PredChroma8[5] = DC8uvNoLeft_C;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_C;
+#endif
+
+  VP8DitherCombine8x8 = DitherCombine8x8_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -770,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8DspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8DspInitMIPS32();
@@ -791,5 +838,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8DspInitNEON();
+  }
+#endif
+
+  assert(VP8TransformWHT != NULL);
+  assert(VP8Transform != NULL);
+  assert(VP8TransformDC != NULL);
+  assert(VP8TransformAC3 != NULL);
+  assert(VP8TransformUV != NULL);
+  assert(VP8TransformDCUV != NULL);
+  assert(VP8VFilter16 != NULL);
+  assert(VP8HFilter16 != NULL);
+  assert(VP8VFilter8 != NULL);
+  assert(VP8HFilter8 != NULL);
+  assert(VP8VFilter16i != NULL);
+  assert(VP8HFilter16i != NULL);
+  assert(VP8VFilter8i != NULL);
+  assert(VP8HFilter8i != NULL);
+  assert(VP8SimpleVFilter16 != NULL);
+  assert(VP8SimpleHFilter16 != NULL);
+  assert(VP8SimpleVFilter16i != NULL);
+  assert(VP8SimpleHFilter16i != NULL);
+  assert(VP8PredLuma4[0] != NULL);
+  assert(VP8PredLuma4[1] != NULL);
+  assert(VP8PredLuma4[2] != NULL);
+  assert(VP8PredLuma4[3] != NULL);
+  assert(VP8PredLuma4[4] != NULL);
+  assert(VP8PredLuma4[5] != NULL);
+  assert(VP8PredLuma4[6] != NULL);
+  assert(VP8PredLuma4[7] != NULL);
+  assert(VP8PredLuma4[8] != NULL);
+  assert(VP8PredLuma4[9] != NULL);
+  assert(VP8PredLuma16[0] != NULL);
+  assert(VP8PredLuma16[1] != NULL);
+  assert(VP8PredLuma16[2] != NULL);
+  assert(VP8PredLuma16[3] != NULL);
+  assert(VP8PredLuma16[4] != NULL);
+  assert(VP8PredLuma16[5] != NULL);
+  assert(VP8PredLuma16[6] != NULL);
+  assert(VP8PredChroma8[0] != NULL);
+  assert(VP8PredChroma8[1] != NULL);
+  assert(VP8PredChroma8[2] != NULL);
+  assert(VP8PredChroma8[3] != NULL);
+  assert(VP8PredChroma8[4] != NULL);
+  assert(VP8PredChroma8[5] != NULL);
+  assert(VP8PredChroma8[6] != NULL);
+  assert(VP8DitherCombine8x8 != NULL);
+
   dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/dec_clip_tables.c b/thirdparty/libwebp/src/dsp/dec_clip_tables.c
index 74ba34c0bb..427b74f776 100644
--- a/thirdparty/libwebp/dsp/dec_clip_tables.c
+++ b/thirdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -11,11 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#define USE_STATIC_TABLES     // undefine to have run-time table initialization
+// define to 0 to have run-time table initialization
+#if !defined(USE_STATIC_TABLES)
+#define USE_STATIC_TABLES 1   // ALTERNATE_CODE
+#endif
 
-#ifdef USE_STATIC_TABLES
+#if (USE_STATIC_TABLES == 1)
 
 static const uint8_t abs0[255 + 255 + 1] = {
   0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;
 
-#endif
+#endif    // USE_STATIC_TABLES
 
 const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
 const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
-#if !defined(USE_STATIC_TABLES)
+#if (USE_STATIC_TABLES == 0)
   int i;
   if (!tables_ok) {
     for (i = -255; i <= 255; ++i) {
diff --git a/thirdparty/libwebp/dsp/dec_mips32.c b/thirdparty/libwebp/src/dsp/dec_mips32.c
index 4e9ef42605..e4e70966d2 100644
--- a/thirdparty/libwebp/dsp/dec_mips32.c
+++ b/thirdparty/libwebp/src/dsp/dec_mips32.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
diff --git a/thirdparty/libwebp/dsp/dec_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
index db5c657228..b0936bc46e 100644
--- a/thirdparty/libwebp/dsp/dec_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
diff --git a/thirdparty/libwebp/dsp/dec_msa.c b/thirdparty/libwebp/src/dsp/dec_msa.c
index 8d9c98c3cf..8090622b7b 100644
--- a/thirdparty/libwebp/dsp/dec_msa.c
+++ b/thirdparty/libwebp/src/dsp/dec_msa.c
@@ -12,11 +12,11 @@
 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
 
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"
 
 //------------------------------------------------------------------------------
 // Transforms
@@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const v16i8 cnst4b = __msa_ldi_b(4);                        \
   const v16i8 cnst3b = __msa_ldi_b(3);                        \
   const v8i16 cnst9h = __msa_ldi_h(9);                        \
+  const v8i16 cnst63h = __msa_ldi_h(63);                      \
                                                               \
   FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
   filt = __msa_subs_s_b(p1_m, q1_m);                          \
@@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
   /* update q2/p2 */                                          \
   temp0 = filt_r * cnst9h;                                    \
-  temp1 = ADDVI_H(temp0, 63);                                 \
+  temp1 = temp0 + cnst63h;                                    \
   temp2 = filt_l * cnst9h;                                    \
-  temp3 = ADDVI_H(temp2, 63);                                 \
+  temp3 = temp2 + cnst63h;                                    \
   FILT2(q2_m, p2_m, q2, p2);                                  \
   /* update q1/p1 */                                          \
   temp1 = temp1 + temp0;                                      \
@@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) {    // vertical
   const uint32_t val0 = LW(ptop + 0);
   const uint32_t val1 = LW(ptop + 4);
   uint32_t out;
-  v16u8 A, B, C, AC, B2, R;
+  v16u8 A = { 0 }, B, C, AC, B2, R;
 
   INSERT_W2_UB(val0, val1, A);
   B = SLDI_UB(A, A, 1);
@@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) {   // Down-right
   uint32_t val0 = LW(ptop + 0);
   uint32_t val1 = LW(ptop + 4);
   uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R, A1;
+  v16u8 A, B, C, AC, B2, R, A1 = { 0 };
 
   INSERT_W2_UB(val0, val1, A1);
   A = SLDI_UB(A1, A1, 12);
@@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
   uint32_t val0 = LW(ptop + 0);
   uint32_t val1 = LW(ptop + 4);
   uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R;
+  v16u8 A = { 0 }, B, C, AC, B2, R;
 
   INSERT_W2_UB(val0, val1, A);
   B = SLDI_UB(A, A, 1);
diff --git a/thirdparty/libwebp/dsp/dec_neon.c b/thirdparty/libwebp/src/dsp/dec_neon.c
index 34796cf4a2..ffa697fcf9 100644
--- a/thirdparty/libwebp/dsp/dec_neon.c
+++ b/thirdparty/libwebp/src/dsp/dec_neon.c
@@ -12,43 +12,23 @@
 // Authors: Somnath Banerjee (somnath@google.com)
 //          Johann Koenig (johannkoenig@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
-#include "./neon.h"
-#include "../dec/vp8i_dec.h"
+#include "src/dsp/neon.h"
+#include "src/dec/vp8i_dec.h"
 
 //------------------------------------------------------------------------------
 // NxM Loading functions
 
-// Load/Store vertical edge
-#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
-  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
-  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
-  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
-
-#define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
-
 #if !defined(WORK_AROUND_GCC)
 
 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
 // (register alloc, probably). The variants somewhat mitigate the problem, but
 // not quite. HFilter16i() remains problematic.
-static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
+static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
+                                            int stride) {
   const uint8x8_t zero = vdup_n_u8(0);
   uint8x8x4_t out;
   INIT_VECTOR4(out, zero, zero, zero, zero);
@@ -63,13 +43,15 @@ static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
   return out;
 }
 
-static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
-  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
-  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
+  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
+  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
@@ -83,9 +65,11 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
   src += stride;                                                     \
 } while (0)
 
-static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   const uint32x4_t zero = vdupq_n_u32(0);
   uint32x4x4_t in;
   INIT_VECTOR4(in, zero, zero, zero, zero);
@@ -126,40 +110,40 @@ static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
 
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p3, uint8x16_t* const p2,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1,
-                                 uint8x16_t* const q2, uint8x16_t* const q3) {
-  Load4x16(src - 2, stride, p3, p2, p1, p0);
-  Load4x16(src + 2, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load8x16_NEON(
+    const uint8_t* const src, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
+  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
+  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
 }
 
-static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   *p1 = vld1q_u8(src - 2 * stride);
   *p0 = vld1q_u8(src - 1 * stride);
   *q0 = vld1q_u8(src + 0 * stride);
   *q1 = vld1q_u8(src + 1 * stride);
 }
 
-static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p3, uint8x16_t* const p2,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1,
-                                 uint8x16_t* const q2, uint8x16_t* const q3) {
-  Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
-  Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load16x8_NEON(
+    const uint8_t* const src, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
+  Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
+  Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
 }
 
-static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
-                                  const uint8_t* const v,
-                                  int stride,
-                                  uint8x16_t* const p3, uint8x16_t* const p2,
-                                  uint8x16_t* const p1, uint8x16_t* const p0,
-                                  uint8x16_t* const q0, uint8x16_t* const q1,
-                                  uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2_NEON(
+    const uint8_t* const u, const uint8_t* const v, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
   // and the v-samples on the higher half.
   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
@@ -177,13 +161,11 @@ static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
 #define LOAD_UV_8(ROW) \
   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
 
-static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
-                                   const uint8_t* const v,
-                                   int stride,
-                                   uint8x16_t* const p3, uint8x16_t* const p2,
-                                   uint8x16_t* const p1, uint8x16_t* const p0,
-                                   uint8x16_t* const q0, uint8x16_t* const q1,
-                                   uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2T_NEON(
+    const uint8_t* const u, const uint8_t* const v, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
   // and the v-samples on the higher half.
   const uint8x16_t row0 = LOAD_UV_8(0);
@@ -238,8 +220,8 @@ static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
 
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
-                                 uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
+                                      uint8_t* const dst, int stride) {
   vst2_lane_u8(dst + 0 * stride, v, 0);
   vst2_lane_u8(dst + 1 * stride, v, 1);
   vst2_lane_u8(dst + 2 * stride, v, 2);
@@ -250,20 +232,20 @@ static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
   vst2_lane_u8(dst + 7 * stride, v, 7);
 }
 
-static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
+                                       uint8_t* const dst, int stride) {
   uint8x8x2_t lo, hi;
   lo.val[0] = vget_low_u8(p0);
   lo.val[1] = vget_low_u8(q0);
   hi.val[0] = vget_high_u8(p0);
   hi.val[1] = vget_high_u8(q0);
-  Store2x8(lo, dst - 1 + 0 * stride, stride);
-  Store2x8(hi, dst - 1 + 8 * stride, stride);
+  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
+  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
 }
 
 #if !defined(WORK_AROUND_GCC)
-static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
-                                 uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
+                                      uint8_t* const dst, int stride) {
   vst4_lane_u8(dst + 0 * stride, v, 0);
   vst4_lane_u8(dst + 1 * stride, v, 1);
   vst4_lane_u8(dst + 2 * stride, v, 2);
@@ -274,9 +256,9 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
   vst4_lane_u8(dst + 7 * stride, v, 7);
 }
 
-static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
-                                  const uint8x16_t q0, const uint8x16_t q1,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                       const uint8x16_t q0, const uint8x16_t q1,
+                                       uint8_t* const dst, int stride) {
   uint8x8x4_t lo, hi;
   INIT_VECTOR4(lo,
                vget_low_u8(p1), vget_low_u8(p0),
@@ -284,27 +266,28 @@ static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
   INIT_VECTOR4(hi,
                vget_high_u8(p1), vget_high_u8(p0),
                vget_high_u8(q0), vget_high_u8(q1));
-  Store4x8(lo, dst - 2 + 0 * stride, stride);
-  Store4x8(hi, dst - 2 + 8 * stride, stride);
+  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
+  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
 }
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
+                                       uint8_t* const dst, int stride) {
   vst1q_u8(dst - stride, p0);
   vst1q_u8(dst, q0);
 }
 
-static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
-                                  const uint8x16_t q0, const uint8x16_t q1,
-                                  uint8_t* const dst, int stride) {
-  Store16x2(p1, p0, dst - stride, stride);
-  Store16x2(q0, q1, dst + stride, stride);
+static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                       const uint8x16_t q0, const uint8x16_t q1,
+                                       uint8_t* const dst, int stride) {
+  Store16x2_NEON(p1, p0, dst - stride, stride);
+  Store16x2_NEON(q0, q1, dst + stride, stride);
 }
 
-static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   // p0 and q0 contain the u+v samples packed in low/high halves.
   vst1_u8(u - stride, vget_low_u8(p0));
   vst1_u8(u,          vget_low_u8(q0));
@@ -312,13 +295,15 @@ static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
   vst1_u8(v,          vget_high_u8(q0));
 }
 
-static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
-                                   const uint8x16_t q0, const uint8x16_t q1,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
+                                        const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        const uint8x16_t q1,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   // The p1...q1 registers contain the u+v samples packed in low/high halves.
-  Store8x2x2(p1, p0, u - stride, v - stride, stride);
-  Store8x2x2(q0, q1, u + stride, v + stride, stride);
+  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
+  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
 }
 
 #if !defined(WORK_AROUND_GCC)
@@ -329,11 +314,10 @@ static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
   (DST) += stride;                                \
 } while (0)
 
-static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
-                                   const uint8x16_t p0, const uint8x16_t q0,
-                                   const uint8x16_t q1, const uint8x16_t q2,
-                                   uint8_t* u, uint8_t* v,
-                                   int stride) {
+static WEBP_INLINE void Store6x8x2_NEON(
+    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+    uint8_t* u, uint8_t* v, int stride) {
   uint8x8x3_t u0, u1, v0, v1;
   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
@@ -358,10 +342,12 @@ static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
 }
 #undef STORE6_LANE
 
-static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
-                                   const uint8x16_t q0, const uint8x16_t q1,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
+                                        const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        const uint8x16_t q1,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   uint8x8x4_t u0, v0;
   INIT_VECTOR4(u0,
                vget_low_u8(p1), vget_low_u8(p0),
@@ -390,15 +376,15 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
 #endif  // !WORK_AROUND_GCC
 
 // Zero extend 'v' to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
   return vreinterpretq_s16_u16(vmovl_u8(v));
 }
 
 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 // to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
-                                            const int16x8_t dst01,
-                                            const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+                                                 const int16x8_t dst01,
+                                                 const int16x8_t dst23) {
   // Unsigned saturate to 8b.
   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -410,8 +396,9 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 }
 
-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
-                               uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+                                    const int16x8_t row23,
+                                    uint8_t* const dst) {
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);
 
@@ -423,23 +410,23 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
 
   {
     // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
-    const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
+    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
+    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
 
     // Descale with rounding.
     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
     // Add the inverse transform.
-    SaturateAndStore4x4(dst, out01, out23);
+    SaturateAndStore4x4_NEON(dst, out01, out23);
   }
 }
 
 //-----------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
-                              const uint8x16_t q0, const uint8x16_t q1,
-                              int thresh) {
+static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                   const uint8x16_t q0, const uint8x16_t q1,
+                                   int thresh) {
   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
@@ -450,18 +437,18 @@ static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
   return mask;
 }
 
-static int8x16_t FlipSign(const uint8x16_t v) {
+static int8x16_t FlipSign_NEON(const uint8x16_t v) {
   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
 }
 
-static uint8x16_t FlipSignBack(const int8x16_t v) {
+static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
   const int8x16_t sign_bit = vdupq_n_s8(0x80);
   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
 }
 
-static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
-                              const int8x16_t q0, const int8x16_t q1) {
+static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
+                                   const int8x16_t q0, const int8x16_t q1) {
   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
@@ -470,7 +457,7 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
   return s3;
 }
 
-static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
+static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
@@ -479,9 +466,10 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
 
 //------------------------------------------------------------------------------
 
-static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
-                               const int8x16_t delta,
-                               int8x16_t* const op0, int8x16_t* const oq0) {
+static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
+                                    const int8x16_t delta,
+                                    int8x16_t* const op0,
+                                    int8x16_t* const oq0) {
   const int8x16_t kCst3 = vdupq_n_s8(0x03);
   const int8x16_t kCst4 = vdupq_n_s8(0x04);
   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
@@ -494,9 +482,9 @@ static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
 
 #if defined(WEBP_USE_INTRINSICS)
 
-static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
-                         const int8x16_t delta,
-                         uint8x16_t* const op0, uint8x16_t* const oq0) {
+static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
+                              const int8x16_t delta,
+                              uint8x16_t* const op0, uint8x16_t* const oq0) {
   const int8x16_t kCst3 = vdupq_n_s8(0x03);
   const int8x16_t kCst4 = vdupq_n_s8(0x04);
   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
@@ -505,45 +493,66 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
-  *op0 = FlipSignBack(sp0);
-  *oq0 = FlipSignBack(sq0);
-}
-
-static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
-                      const uint8x16_t q0, const uint8x16_t q1,
-                      const uint8x16_t mask,
-                      uint8x16_t* const op0, uint8x16_t* const oq0) {
-  const int8x16_t p1s = FlipSign(p1);
-  const int8x16_t p0s = FlipSign(p0);
-  const int8x16_t q0s = FlipSign(q0);
-  const int8x16_t q1s = FlipSign(q1);
-  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+  *op0 = FlipSignBack_NEON(sp0);
+  *oq0 = FlipSignBack_NEON(sq0);
+}
+
+static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                           const uint8x16_t q0, const uint8x16_t q1,
+                           const uint8x16_t mask,
+                           uint8x16_t* const op0, uint8x16_t* const oq0) {
+  const int8x16_t p1s = FlipSign_NEON(p1);
+  const int8x16_t p0s = FlipSign_NEON(p0);
+  const int8x16_t q0s = FlipSign_NEON(q0);
+  const int8x16_t q1s = FlipSign_NEON(q1);
+  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
-  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
+  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
 }
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
   uint8x16_t p1, p0, q0, q1, op0, oq0;
-  Load16x4(p, stride, &p1, &p0, &q0, &q1);
+  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
   {
-    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
-    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
+    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
   }
-  Store16x2(op0, oq0, p, stride);
+  Store16x2_NEON(op0, oq0, p, stride);
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
   uint8x16_t p1, p0, q0, q1, oq0, op0;
-  Load4x16(p, stride, &p1, &p0, &q0, &q1);
+  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
   {
-    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
-    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
+    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
   }
-  Store2x16(op0, oq0, p, stride);
+  Store2x16_NEON(op0, oq0, p, stride);
 }
 
 #else
 
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
+
+#define STORE8x2(c1, c2, p, stride)                                            \
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
+
 #define QRegs "q0", "q1", "q2", "q3",                                          \
               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
 
@@ -592,7 +601,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
   DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
   FLIP_SIGN_BIT2(p0, q0, q10)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
   __asm__ volatile (
     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 
@@ -613,7 +622,7 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
   );
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
   __asm__ volatile (
     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
@@ -639,30 +648,33 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
   );
 }
 
+#undef LOAD8x4
+#undef STORE8x2
+
 #endif    // WEBP_USE_INTRINSICS
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
   uint32_t k;
   for (k = 3; k != 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_NEON(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
   uint32_t k;
   for (k = 3; k != 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_NEON(p, stride, thresh);
   }
 }
 
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
-                           const uint8x16_t q0, const uint8x16_t q1,
-                           int hev_thresh) {
+static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                const uint8x16_t q0, const uint8x16_t q1,
+                                int hev_thresh) {
   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
@@ -671,11 +683,11 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
   return mask;
 }
 
-static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
-                               const uint8x16_t p1, const uint8x16_t p0,
-                               const uint8x16_t q0, const uint8x16_t q1,
-                               const uint8x16_t q2, const uint8x16_t q3,
-                               int ithresh, int thresh) {
+static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
+                                    const uint8x16_t p1, const uint8x16_t p0,
+                                    const uint8x16_t q0, const uint8x16_t q1,
+                                    const uint8x16_t q2, const uint8x16_t q3,
+                                    int ithresh, int thresh) {
   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
@@ -689,14 +701,14 @@ static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
   const uint8x16_t max12 = vmaxq_u8(max1, max2);
   const uint8x16_t max123 = vmaxq_u8(max12, max3);
   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
-  const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
+  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
   const uint8x16_t mask = vandq_u8(mask1, mask2);
   return mask;
 }
 
 //  4-points filter
 
-static void ApplyFilter4(
+static void ApplyFilter4_NEON(
     const int8x16_t p1, const int8x16_t p0,
     const int8x16_t q0, const int8x16_t q1,
     const int8x16_t delta0,
@@ -709,47 +721,47 @@ static void ApplyFilter4(
   const int8x16_t a1 = vshrq_n_s8(delta1, 3);
   const int8x16_t a2 = vshrq_n_s8(delta2, 3);
   const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
-  *op0 = FlipSignBack(vqaddq_s8(p0, a2));  // clip(p0 + a2)
-  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
-  *op1 = FlipSignBack(vqaddq_s8(p1, a3));  // clip(p1 + a3)
-  *oq1 = FlipSignBack(vqsubq_s8(q1, a3));  // clip(q1 - a3)
+  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
+  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
+  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
+  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
 }
 
-static void DoFilter4(
+static void DoFilter4_NEON(
     const uint8x16_t p1, const uint8x16_t p0,
     const uint8x16_t q0, const uint8x16_t q1,
     const uint8x16_t mask, const uint8x16_t hev_mask,
     uint8x16_t* const op1, uint8x16_t* const op0,
     uint8x16_t* const oq0, uint8x16_t* const oq1) {
   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
-  const int8x16_t p1s = FlipSign(p1);
-  int8x16_t p0s = FlipSign(p0);
-  int8x16_t q0s = FlipSign(q0);
-  const int8x16_t q1s = FlipSign(q1);
+  const int8x16_t p1s = FlipSign_NEON(p1);
+  int8x16_t p0s = FlipSign_NEON(p0);
+  int8x16_t q0s = FlipSign_NEON(q0);
+  const int8x16_t q1s = FlipSign_NEON(q1);
   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
 
   // do_filter2 part (simple loopfilter on pixels with hev)
   {
-    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
+    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
     const int8x16_t simple_lf_delta =
         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
-    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
   }
 
   // do_filter4 part (complex loopfilter on pixels without hev)
   {
-    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
+    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
     const int8x16_t complex_lf_delta =
         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
-    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
+    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
   }
 }
 
 //  6-points filter
 
-static void ApplyFilter6(
+static void ApplyFilter6_NEON(
     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
     const int8x16_t delta,
@@ -778,35 +790,35 @@ static void ApplyFilter6(
   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
 
-  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
-  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - q1)
-  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
-  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
-  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
-  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
+  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
+  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
+  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
+  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
+  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
+  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
 }
 
-static void DoFilter6(
+static void DoFilter6_NEON(
     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
     const uint8x16_t mask, const uint8x16_t hev_mask,
     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
-  const int8x16_t p2s = FlipSign(p2);
-  const int8x16_t p1s = FlipSign(p1);
-  int8x16_t p0s = FlipSign(p0);
-  int8x16_t q0s = FlipSign(q0);
-  const int8x16_t q1s = FlipSign(q1);
-  const int8x16_t q2s = FlipSign(q2);
+  const int8x16_t p2s = FlipSign_NEON(p2);
+  const int8x16_t p1s = FlipSign_NEON(p1);
+  int8x16_t p0s = FlipSign_NEON(p0);
+  int8x16_t q0s = FlipSign_NEON(q0);
+  const int8x16_t q1s = FlipSign_NEON(q1);
+  const int8x16_t q2s = FlipSign_NEON(q2);
   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
-  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
 
   // do_filter2 part (simple loopfilter on pixels with hev)
   {
     const int8x16_t simple_lf_delta =
         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
-    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
   }
 
   // do_filter6 part (complex loopfilter on pixels without hev)
@@ -815,65 +827,65 @@ static void DoFilter6(
     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
     const int8x16_t complex_lf_delta =
         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
-    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
-                 op2, op1, op0, oq0, oq1, oq2);
+    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
+                      op2, op1, op0, oq0, oq1, oq2);
   }
 }
 
 // on macroblock edges
 
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_NEON(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store16x2(op2, op1, p - 2 * stride, stride);
-    Store16x2(op0, oq0, p + 0 * stride, stride);
-    Store16x2(oq1, oq2, p + 2 * stride, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
+    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
+    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
   }
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_NEON(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store2x16(op2, op1, p - 2, stride);
-    Store2x16(op0, oq0, p + 0, stride);
-    Store2x16(oq1, oq2, p + 2, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store2x16_NEON(op2, op1, p - 2, stride);
+    Store2x16_NEON(op0, oq0, p + 0, stride);
+    Store2x16_NEON(oq1, oq2, p + 2, stride);
   }
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_NEON(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   uint32_t k;
   uint8x16_t p3, p2, p1, p0;
-  Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
+  Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
   for (k = 3; k != 0; --k) {
     uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
+    Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
-          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
-      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
       // p3 and p2 are not just temporary variables here: they will be
       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
-      Store16x4(p1, p0, p3, p2, p, stride);
+      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4_NEON(p1, p0, p3, p2, p, stride);
       p1 = q2;
       p0 = q3;
     }
@@ -881,21 +893,21 @@ static void VFilter16i(uint8_t* p, int stride,
 }
 
 #if !defined(WORK_AROUND_GCC)
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_NEON(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   uint32_t k;
   uint8x16_t p3, p2, p1, p0;
-  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
   for (k = 3; k != 0; --k) {
     uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
+    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
-          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
-      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
-      Store4x16(p1, p0, p3, p2, p, stride);
+          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
+      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16_NEON(p1, p0, p3, p2, p, stride);
       p1 = q2;
       p0 = q3;
     }
@@ -904,67 +916,67 @@ static void HFilter16i(uint8_t* p, int stride,
 #endif  // !WORK_AROUND_GCC
 
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
-    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
-    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
+    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
+    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
   }
 }
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4 * stride;
   v += 4 * stride;
-  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op1, op0, oq0, oq1;
-    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
+    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
   }
 }
 
 #if !defined(WORK_AROUND_GCC)
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
   }
 }
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4;
   v += 4;
-  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op1, op0, oq0, oq1;
-    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
+    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
   }
 }
 #endif  // !WORK_AROUND_GCC
@@ -992,8 +1004,9 @@ static const int16_t kC1 = 20091;
 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 
 #if defined(WEBP_USE_INTRINSICS)
-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
-                                     int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+                                          const int16x8_t in1,
+                                          int16x8x2_t* const out) {
   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
@@ -1001,7 +1014,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
 }
 
-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   // {rows} = in0 | in4
   //          in8 | in12
   // B1 = in4 | in12
@@ -1024,20 +1037,20 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
-  Transpose8x2(E0, E1, rows);
+  Transpose8x2_NEON(E0, E1, rows);
 }
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
-  TransformPass(&rows);
-  TransformPass(&rows);
-  Add4x4(rows.val[0], rows.val[1], dst);
+  TransformPass_NEON(&rows);
+  TransformPass_NEON(&rows);
+  Add4x4_NEON(rows.val[0], rows.val[1], dst);
 }
 
 #else
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
   const int kBPS = BPS;
   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   const int16_t constants[4] = { kC1, kC2, 0, 0 };
@@ -1170,16 +1183,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 
 #endif    // WEBP_USE_INTRINSICS
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne_NEON(in, dst);
   if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOne_NEON(in + 16, dst + 4);
   }
 }
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
   const int16x8_t DC = vdupq_n_s16(in[0]);
-  Add4x4(DC, DC, dst);
+  Add4x4_NEON(DC, DC, dst);
 }
 
 //------------------------------------------------------------------------------
@@ -1191,7 +1204,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)
 
-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
   int32x4x4_t tmp;
 
   {
@@ -1209,7 +1222,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     tmp.val[2] = vsubq_s32(a0, a1);
     tmp.val[3] = vsubq_s32(a3, a2);
     // Arrange the temporary results column-wise.
-    tmp = Transpose4x4(tmp);
+    tmp = Transpose4x4_NEON(tmp);
   }
 
   {
@@ -1243,7 +1256,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
 
 #define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
   static const int kC1_full = 20091 + (1 << 16);
   static const int kC2_full = 35468;
   const int16x4_t A = vld1_dup_s16(in);
@@ -1259,14 +1272,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const int16x4_t B = vqadd_s16(A, CD);
   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
-  Add4x4(m0_m1, m2_m3, dst);
+  Add4x4_NEON(m0_m1, m2_m3, dst);
 }
 #undef MUL
 
 //------------------------------------------------------------------------------
 // 4x4
 
-static void DC4(uint8_t* dst) {    // DC
+static void DC4_NEON(uint8_t* dst) {    // DC
   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   const uint16x4_t p1 = vpadd_u16(p0, p0);
@@ -1287,17 +1300,17 @@ static void DC4(uint8_t* dst) {    // DC
 }
 
 // TrueMotion (4x4 + 8x8)
-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
   int y;
   for (y = 0; y < size; y += 4) {
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
     const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
     const int16x8_t r1 = vaddq_s16(L1, d);
     const int16x8_t r2 = vaddq_s16(L2, d);
@@ -1322,9 +1335,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   }
 }
 
-static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
 
-static void VE4(uint8_t* dst) {    // vertical
+static void VE4_NEON(uint8_t* dst) {    // vertical
   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
   const uint64x1_t A1 = vshr_n_u64(A0, 8);
@@ -1340,7 +1353,7 @@ static void VE4(uint8_t* dst) {    // vertical
   }
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_NEON(uint8_t* dst) {   // Down-right
   const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
@@ -1368,7 +1381,7 @@ static void RD4(uint8_t* dst) {   // Down-right
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
 }
 
-static void LD4(uint8_t* dst) {    // Down-left
+static void LD4_NEON(uint8_t* dst) {    // Down-left
   // Note using the same shift trick as VE4() is slower here.
   const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
   const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
@@ -1390,7 +1403,7 @@ static void LD4(uint8_t* dst) {    // Down-left
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+static void VE8uv_NEON(uint8_t* dst) {    // vertical
   const uint8x8_t top = vld1_u8(dst - BPS);
   int j;
   for (j = 0; j < 8; ++j) {
@@ -1398,7 +1411,7 @@ static void VE8uv(uint8_t* dst) {    // vertical
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
+static void HE8uv_NEON(uint8_t* dst) {    // horizontal
   int j;
   for (j = 0; j < 8; ++j) {
     const uint8x8_t left = vld1_dup_u8(dst - 1);
@@ -1407,7 +1420,7 @@ static void HE8uv(uint8_t* dst) {    // horizontal
   }
 }
 
-static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
   uint16x8_t sum_top;
   uint16x8_t sum_left;
   uint8x8_t dc0;
@@ -1458,17 +1471,17 @@ static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
   }
 }
 
-static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
-static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
-static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
-static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
+static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
+static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
+static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
 
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
 
 //------------------------------------------------------------------------------
 // 16x16
 
-static void VE16(uint8_t* dst) {     // vertical
+static void VE16_NEON(uint8_t* dst) {     // vertical
   const uint8x16_t top = vld1q_u8(dst - BPS);
   int j;
   for (j = 0; j < 16; ++j) {
@@ -1476,7 +1489,7 @@ static void VE16(uint8_t* dst) {     // vertical
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_NEON(uint8_t* dst) {     // horizontal
   int j;
   for (j = 0; j < 16; ++j) {
     const uint8x16_t left = vld1q_dup_u8(dst - 1);
@@ -1485,7 +1498,7 @@ static void HE16(uint8_t* dst) {     // horizontal
   }
 }
 
-static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
   uint16x8_t sum_top;
   uint16x8_t sum_left;
   uint8x8_t dc0;
@@ -1542,12 +1555,12 @@ static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
   }
 }
 
-static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
-static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
-static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
-static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
+static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
+static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
+static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
 
-static void TM16(uint8_t* dst) {
+static void TM16_NEON(uint8_t* dst) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   // A[c] - A[-1]
@@ -1556,10 +1569,10 @@ static void TM16(uint8_t* dst) {
   int y;
   for (y = 0; y < 16; y += 4) {
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
     const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
     const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
     const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
@@ -1587,49 +1600,49 @@ static void TM16(uint8_t* dst) {
 extern void VP8DspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
-  VP8Transform = TransformTwo;
-  VP8TransformAC3 = TransformAC3;
-  VP8TransformDC = TransformDC;
-  VP8TransformWHT = TransformWHT;
-
-  VP8VFilter16 = VFilter16;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16 = HFilter16;
+  VP8Transform = TransformTwo_NEON;
+  VP8TransformAC3 = TransformAC3_NEON;
+  VP8TransformDC = TransformDC_NEON;
+  VP8TransformWHT = TransformWHT_NEON;
+
+  VP8VFilter16 = VFilter16_NEON;
+  VP8VFilter16i = VFilter16i_NEON;
+  VP8HFilter16 = HFilter16_NEON;
 #if !defined(WORK_AROUND_GCC)
-  VP8HFilter16i = HFilter16i;
+  VP8HFilter16i = HFilter16i_NEON;
 #endif
-  VP8VFilter8 = VFilter8;
-  VP8VFilter8i = VFilter8i;
+  VP8VFilter8 = VFilter8_NEON;
+  VP8VFilter8i = VFilter8i_NEON;
 #if !defined(WORK_AROUND_GCC)
-  VP8HFilter8 = HFilter8;
-  VP8HFilter8i = HFilter8i;
+  VP8HFilter8 = HFilter8_NEON;
+  VP8HFilter8i = HFilter8i_NEON;
 #endif
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[0] = DC4;
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[6] = LD4;
-
-  VP8PredLuma16[0] = DC16TopLeft;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
+  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
+  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
+  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
+  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
+
+  VP8PredLuma4[0] = DC4_NEON;
+  VP8PredLuma4[1] = TM4_NEON;
+  VP8PredLuma4[2] = VE4_NEON;
+  VP8PredLuma4[4] = RD4_NEON;
+  VP8PredLuma4[6] = LD4_NEON;
+
+  VP8PredLuma16[0] = DC16TopLeft_NEON;
+  VP8PredLuma16[1] = TM16_NEON;
+  VP8PredLuma16[2] = VE16_NEON;
+  VP8PredLuma16[3] = HE16_NEON;
+  VP8PredLuma16[4] = DC16NoTop_NEON;
+  VP8PredLuma16[5] = DC16NoLeft_NEON;
+  VP8PredLuma16[6] = DC16NoTopLeft_NEON;
+
+  VP8PredChroma8[0] = DC8uv_NEON;
+  VP8PredChroma8[1] = TM8uv_NEON;
+  VP8PredChroma8[2] = VE8uv_NEON;
+  VP8PredChroma8[3] = HE8uv_NEON;
+  VP8PredChroma8[4] = DC8uvNoTop_NEON;
+  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
 }
 
 #else  // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/dec_sse2.c b/thirdparty/libwebp/src/dsp/dec_sse2.c
index 411fb02768..b3840faf3a 100644
--- a/thirdparty/libwebp/dsp/dec_sse2.c
+++ b/thirdparty/libwebp/src/dsp/dec_sse2.c
@@ -12,23 +12,25 @@
 // Author: somnath@google.com (Somnath Banerjee)
 //         cduvivier@google.com (Christian Duvivier)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
 // one it seems => disable it by default. Uncomment the following to enable:
-// #define USE_TRANSFORM_AC3
+#if !defined(USE_TRANSFORM_AC3)
+#define USE_TRANSFORM_AC3 0   // ALTERNATE_CODE
+#endif
 
 #include <emmintrin.h>
-#include "./common_sse2.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
-static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
   }
 }
 
-#if defined(USE_TRANSFORM_AC3)
+#if (USE_TRANSFORM_AC3 == 1)
 #define MUL(a, b) (((a) * (b)) >> 16)
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
   static const int kC1 = 20091 + (1 << 16);
@@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
     _mm_subs_epu8((p), (q)))
 
 // Shift each byte of "x" by 3 bits while preserving by the sign bit.
-static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
   const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
 }
 
 #define FLIP_SIGN_BIT2(a, b) {                                                 \
-  a = _mm_xor_si128(a, sign_bit);                                              \
-  b = _mm_xor_si128(b, sign_bit);                                              \
+  (a) = _mm_xor_si128(a, sign_bit);                                            \
+  (b) = _mm_xor_si128(b, sign_bit);                                            \
 }
 
 #define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
@@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
 }
 
 // input/output is uint8_t
-static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
-                                  const __m128i* const p0,
-                                  const __m128i* const q0,
-                                  const __m128i* const q1,
-                                  int hev_thresh, __m128i* const not_hev) {
+static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
+                                       const __m128i* const p0,
+                                       const __m128i* const q0,
+                                       const __m128i* const q1,
+                                       int hev_thresh, __m128i* const not_hev) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i t_1 = MM_ABS(*p1, *p0);
   const __m128i t_2 = MM_ABS(*q1, *q0);
@@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
 }
 
 // input pixels are int8_t
-static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
-                                     const __m128i* const p0,
-                                     const __m128i* const q0,
-                                     const __m128i* const q1,
-                                     __m128i* const delta) {
+static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
+                                          const __m128i* const p0,
+                                          const __m128i* const q0,
+                                          const __m128i* const q1,
+                                          __m128i* const delta) {
   // beware of addition order, for saturation!
   const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
   const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
@@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
 }
 
 // input and output are int8_t
-static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
-                                       const __m128i* const fl) {
+static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
+                                            __m128i* const q0,
+                                            const __m128i* const fl) {
   const __m128i k3 = _mm_set1_epi8(3);
   const __m128i k4 = _mm_set1_epi8(4);
   __m128i v3 = _mm_adds_epi8(*fl, k3);
   __m128i v4 = _mm_adds_epi8(*fl, k4);
 
-  SignedShift8b(&v4);                  // v4 >> 3
-  SignedShift8b(&v3);                  // v3 >> 3
+  SignedShift8b_SSE2(&v4);             // v4 >> 3
+  SignedShift8b_SSE2(&v3);             // v3 >> 3
   *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
   *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
 }
@@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
 // Update operations:
 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
 // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
-static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
-                                      const __m128i* const a0_lo,
-                                      const __m128i* const a0_hi) {
+static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
+                                           const __m128i* const a0_lo,
+                                           const __m128i* const a0_hi) {
   const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
   const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
   const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
 }
 
 // input pixels are uint8_t
-static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
-                                    const __m128i* const p0,
-                                    const __m128i* const q0,
-                                    const __m128i* const q1,
-                                    int thresh, __m128i* const mask) {
+static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
+                                         const __m128i* const p0,
+                                         const __m128i* const q0,
+                                         const __m128i* const q1,
+                                         int thresh, __m128i* const mask) {
   const __m128i m_thresh = _mm_set1_epi8(thresh);
   const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
   const __m128i kFE = _mm_set1_epi8(0xFE);
@@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
 // Edge filtering functions
 
 // Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
-                                  int thresh) {
+static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
+                                       __m128i* const q0, __m128i* const q1,
+                                       int thresh) {
   __m128i a, mask;
   const __m128i sign_bit = _mm_set1_epi8(0x80);
-  // convert p1/q1 to int8_t (for GetBaseDelta)
+  // convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
   const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
   const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
 
-  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
 
   FLIP_SIGN_BIT2(*p0, *q0);
-  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
+  GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
   a = _mm_and_si128(a, mask);     // mask filter values we don't care about
-  DoSimpleFilter(p0, q0, &a);
+  DoSimpleFilter_SSE2(p0, q0, &a);
   FLIP_SIGN_BIT2(*p0, *q0);
 }
 
 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
-                                  const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
+                                       __m128i* const q0, __m128i* const q1,
+                                       const __m128i* const mask,
+                                       int hev_thresh) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i sign_bit = _mm_set1_epi8(0x80);
   const __m128i k64 = _mm_set1_epi8(64);
@@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
   __m128i t1, t2, t3;
 
   // compute hev mask
-  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
 
   // convert to signed values
   FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
 
   t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
   t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
-  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
-  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  SignedShift8b_SSE2(&t2);           // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  SignedShift8b_SSE2(&t3);           // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
   *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
   *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
   FLIP_SIGN_BIT2(*p0, *q0);
@@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
 }
 
 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
-                                  __m128i* const p0, __m128i* const q0,
-                                  __m128i* const q1, __m128i* const q2,
-                                  const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
+                                       __m128i* const p0, __m128i* const q0,
+                                       __m128i* const q1, __m128i* const q2,
+                                       const __m128i* const mask,
+                                       int hev_thresh) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i sign_bit = _mm_set1_epi8(0x80);
   __m128i a, not_hev;
 
   // compute hev mask
-  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
 
   FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
   FLIP_SIGN_BIT2(*p2, *q2);
-  GetBaseDelta(p1, p0, q0, q1, &a);
+  GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
 
   { // do simple filter on pixels with hev
     const __m128i m = _mm_andnot_si128(not_hev, *mask);
     const __m128i f = _mm_and_si128(a, m);
-    DoSimpleFilter(p0, q0, &f);
+    DoSimpleFilter_SSE2(p0, q0, &f);
   }
 
   { // do strong filter on pixels with not hev
@@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
     const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
     const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63
 
-    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
-    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
-    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+    Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
   }
 }
 
 // reads 8 rows across a vertical edge.
-static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
-                                __m128i* const p, __m128i* const q) {
+static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
+                                     __m128i* const p, __m128i* const q) {
   // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
   // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
   const __m128i A0 = _mm_set_epi32(
@@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
   *q = _mm_unpackhi_epi32(C0, C1);
 }
 
-static WEBP_INLINE void Load16x4(const uint8_t* const r0,
-                                 const uint8_t* const r8,
-                                 int stride,
-                                 __m128i* const p1, __m128i* const p0,
-                                 __m128i* const q0, __m128i* const q1) {
+static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
+                                      const uint8_t* const r8,
+                                      int stride,
+                                      __m128i* const p1, __m128i* const p0,
+                                      __m128i* const q0, __m128i* const q1) {
   // Assume the pixels around the edge (|) are numbered as follows
   //                00 01 | 02 03
   //                10 11 | 12 13
@@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
   // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
   // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
   // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-  Load8x4(r0, stride, p1, q0);
-  Load8x4(r8, stride, p0, q1);
+  Load8x4_SSE2(r0, stride, p1, q0);
+  Load8x4_SSE2(r8, stride, p0, q1);
 
   {
     // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
   }
 }
 
-static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
+                                      uint8_t* dst, int stride) {
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
     WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
 }
 
 // Transpose back and store
-static WEBP_INLINE void Store16x4(const __m128i* const p1,
-                                  const __m128i* const p0,
-                                  const __m128i* const q0,
-                                  const __m128i* const q1,
-                                  uint8_t* r0, uint8_t* r8,
-                                  int stride) {
+static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
+                                       const __m128i* const p0,
+                                       const __m128i* const q0,
+                                       const __m128i* const q1,
+                                       uint8_t* r0, uint8_t* r8,
+                                       int stride) {
   __m128i t1, p1_s, p0_s, q0_s, q1_s;
 
   // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
   p1_s = _mm_unpacklo_epi16(t1, q1_s);
   q1_s = _mm_unpackhi_epi16(t1, q1_s);
 
-  Store4x4(&p0_s, r0, stride);
+  Store4x4_SSE2(&p0_s, r0, stride);
   r0 += 4 * stride;
-  Store4x4(&q0_s, r0, stride);
+  Store4x4_SSE2(&q0_s, r0, stride);
 
-  Store4x4(&p1_s, r8, stride);
+  Store4x4_SSE2(&p1_s, r8, stride);
   r8 += 4 * stride;
-  Store4x4(&q1_s, r8, stride);
+  Store4x4_SSE2(&q1_s, r8, stride);
 }
 
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
   // Load
   __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
   __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
   __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
   __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
 
-  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
 
   // Store
   _mm_storeu_si128((__m128i*)&p[-stride], p0);
   _mm_storeu_si128((__m128i*)&p[0], q0);
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
   __m128i p1, p0, q0, q1;
 
   p -= 2;  // beginning of p1
 
-  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
-  DoFilter2(&p1, &p0, &q0, &q1, thresh);
-  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
+  Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
 }
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_SSE2(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_SSE2(p, stride, thresh);
   }
 }
 
@@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 // Complex In-loop filtering (Paragraph 15.3)
 
 #define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
-  m = MM_ABS(p1, p0);                                                          \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  (m) = MM_ABS(p1, p0);                                                        \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)
 
 #define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  (m) = _mm_max_epu8(m, MM_ABS(p1, p0));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)
 
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
-  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
-  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
-  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
-  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
+  (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
+  (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
+  (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
+  (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
 }
 
 #define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
   const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
   const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
-  p = _mm_unpacklo_epi64(U, V);                                                \
+  (p) = _mm_unpacklo_epi64(U, V);                                              \
 } while (0)
 
 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
-  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
-  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
-  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
-  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
+  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
+  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
+  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
+  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
 }
 
 #define STOREUV(p, u, v, stride) {                                             \
-  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
-  p = _mm_srli_si128(p, 8);                                                    \
-  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
+  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
+  (p) = _mm_srli_si128(p, 8);                                                  \
+  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
 }
 
-static WEBP_INLINE void ComplexMask(const __m128i* const p1,
-                                    const __m128i* const p0,
-                                    const __m128i* const q0,
-                                    const __m128i* const q1,
-                                    int thresh, int ithresh,
-                                    __m128i* const mask) {
+static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
+                                         const __m128i* const p0,
+                                         const __m128i* const q0,
+                                         const __m128i* const q1,
+                                         int thresh, int ithresh,
+                                         __m128i* const mask) {
   const __m128i it = _mm_set1_epi8(ithresh);
   const __m128i diff = _mm_subs_epu8(*mask, it);
   const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
   __m128i filter_mask;
-  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
   *mask = _mm_and_si128(thresh_mask, filter_mask);
 }
 
 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_SSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i t1;
   __m128i mask;
   __m128i p2, p1, p0, q0, q1, q2;
@@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
   LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
   MAX_DIFF2(t1, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
   // Store
   _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
   _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_SSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
   uint8_t* const b = p - 4;
-  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
   MAX_DIFF1(p3, p2, p1, p0, mask);
 
-  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
+  Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
   MAX_DIFF2(q3, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
-  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
-  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
+  Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+  Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_SSE2(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   int k;
   __m128i p3, p2, p1, p0;   // loop invariants
 
@@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,
 
     // p3 and p2 are not just temporary variables here: they will be
     // re-used for next span. And q2/q3 will become p1/p0 accordingly.
-    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
     // Store
     _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
   }
 }
 
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_SSE2(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   int k;
   __m128i p3, p2, p1, p0;   // loop invariants
 
-  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
+  Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
 
   for (k = 3; k > 0; --k) {
     __m128i mask, tmp1, tmp2;
@@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
     p += 4;  // beginning of q0 (and next span)
 
     MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
-    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
     MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
-    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+    Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
 
     // rotate samples
     p1 = tmp1;
@@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
 }
 
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, p2, p1, p0, q0, q1, q2;
 
@@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
   LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
   MAX_DIFF2(t1, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
   // Store
   STOREUV(p2, u, v, -3 * stride);
@@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q2, u, v, 2 * stride);
 }
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
   uint8_t* const tu = u - 4;
   uint8_t* const tv = v - 4;
-  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
   MAX_DIFF1(p3, p2, p1, p0, mask);
 
-  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
+  Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
   MAX_DIFF2(q3, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
-  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
-  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
+  Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
+  Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
 
@@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
   LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
   MAX_DIFF2(t2, t1, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
 
   // Store
   STOREUV(p1, u, v, -2 * stride);
@@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q1, u, v, 1 * stride);
 }
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
-  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
   MAX_DIFF1(t2, t1, p1, p0, mask);
 
   u += 4;  // beginning of q0
   v += 4;
-  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
   MAX_DIFF2(t2, t1, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
 
   u -= 2;  // beginning of p1
   v -= 2;
-  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
+  Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
 }
 
 //------------------------------------------------------------------------------
@@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
 
-static void VE4(uint8_t* dst) {    // vertical
+static void VE4_SSE2(uint8_t* dst) {    // vertical
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) {    // vertical
   }
 }
 
-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_SSE2(uint8_t* dst) {   // Down-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
   WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
   const __m128i one = _mm_set1_epi8(1);
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
@@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
   DST(0, 3) = AVG3(K, J, I);
 }
 
-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
   DST(3, 3) = (extra_out >> 8) & 0xff;
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_SSE2(uint8_t* dst) {   // Down-right
   const __m128i one = _mm_set1_epi8(1);
   const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
   const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) {   // Down-right
 //------------------------------------------------------------------------------
 // Luma 16x16
 
-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
   const uint8_t* top = dst - BPS;
   const __m128i zero = _mm_setzero_si128();
   int y;
@@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   }
 }
 
-static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4_SSE2(uint8_t* dst)   { TrueMotion_SSE2(dst, 4); }
+static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
+static void TM16_SSE2(uint8_t* dst)  { TrueMotion_SSE2(dst, 16); }
 
-static void VE16(uint8_t* dst) {
+static void VE16_SSE2(uint8_t* dst) {
   const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
   int j;
   for (j = 0; j < 16; ++j) {
@@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_SSE2(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
     const __m128i values = _mm_set1_epi8(dst[-1]);
@@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) {     // horizontal
   }
 }
 
-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 16; ++j) {
@@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
   }
 }
 
-static void DC16(uint8_t* dst) {    // DC
+static void DC16_SSE2(uint8_t* dst) {  // DC
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
   const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) {    // DC
   }
   {
     const int DC = _mm_cvtsi128_si32(sum) + left + 16;
-    Put16(DC >> 5, dst);
+    Put16_SSE2(DC >> 5, dst);
   }
 }
 
-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_SSE2(uint8_t* dst) {  // DC with top samples unavailable
   int DC = 8;
   int j;
   for (j = 0; j < 16; ++j) {
     DC += dst[-1 + j * BPS];
   }
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }
 
-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_SSE2(uint8_t* dst) {  // DC with left samples unavailable
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
   const __m128i sad8x2 = _mm_sad_epu8(top, zero);
   // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
   const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
   const int DC = _mm_cvtsi128_si32(sum) + 8;
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }
 
-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
-  Put16(0x80, dst);
+static void DC16NoTopLeft_SSE2(uint8_t* dst) {  // DC with no top & left samples
+  Put16_SSE2(0x80, dst);
 }
 
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+static void VE8uv_SSE2(uint8_t* dst) {    // vertical
   int j;
   const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
   for (j = 0; j < 8; ++j) {
@@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) {    // vertical
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
-  int j;
-  for (j = 0; j < 8; ++j) {
-    const __m128i values = _mm_set1_epi8(dst[-1]);
-    _mm_storel_epi64((__m128i*)dst, values);
-    dst += BPS;
-  }
-}
-
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 8; ++j) {
@@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
   }
 }
 
-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_SSE2(uint8_t* dst) {     // DC
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
   const __m128i sum = _mm_sad_epu8(top, zero);
@@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) {     // DC
   }
   {
     const int DC = _mm_cvtsi128_si32(sum) + left + 8;
-    Put8x8uv(DC >> 4, dst);
+    Put8x8uv_SSE2(DC >> 4, dst);
   }
 }
 
-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_SSE2(uint8_t* dst) {   // DC with no left samples
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
   const __m128i sum = _mm_sad_epu8(top, zero);
   const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv(DC >> 3, dst);
+  Put8x8uv_SSE2(DC >> 3, dst);
 }
 
-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_SSE2(uint8_t* dst) {  // DC with no top samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
     dc0 += dst[-1 + i * BPS];
   }
-  Put8x8uv(dc0 >> 3, dst);
+  Put8x8uv_SSE2(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
-  Put8x8uv(0x80, dst);
+static void DC8uvNoTopLeft_SSE2(uint8_t* dst) {    // DC with nothing
+  Put8x8uv_SSE2(0x80, dst);
 }
 
 //------------------------------------------------------------------------------
@@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
 extern void VP8DspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
-  VP8Transform = Transform;
-#if defined(USE_TRANSFORM_AC3)
-  VP8TransformAC3 = TransformAC3;
+  VP8Transform = Transform_SSE2;
+#if (USE_TRANSFORM_AC3 == 1)
+  VP8TransformAC3 = TransformAC3_SSE2;
 #endif
 
-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
-
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
-
-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
+  VP8VFilter16 = VFilter16_SSE2;
+  VP8HFilter16 = HFilter16_SSE2;
+  VP8VFilter8 = VFilter8_SSE2;
+  VP8HFilter8 = HFilter8_SSE2;
+  VP8VFilter16i = VFilter16i_SSE2;
+  VP8HFilter16i = HFilter16i_SSE2;
+  VP8VFilter8i = VFilter8i_SSE2;
+  VP8HFilter8i = HFilter8i_SSE2;
+
+  VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
+  VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
+  VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
+  VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
+
+  VP8PredLuma4[1] = TM4_SSE2;
+  VP8PredLuma4[2] = VE4_SSE2;
+  VP8PredLuma4[4] = RD4_SSE2;
+  VP8PredLuma4[5] = VR4_SSE2;
+  VP8PredLuma4[6] = LD4_SSE2;
+  VP8PredLuma4[7] = VL4_SSE2;
+
+  VP8PredLuma16[0] = DC16_SSE2;
+  VP8PredLuma16[1] = TM16_SSE2;
+  VP8PredLuma16[2] = VE16_SSE2;
+  VP8PredLuma16[3] = HE16_SSE2;
+  VP8PredLuma16[4] = DC16NoTop_SSE2;
+  VP8PredLuma16[5] = DC16NoLeft_SSE2;
+  VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
+
+  VP8PredChroma8[0] = DC8uv_SSE2;
+  VP8PredChroma8[1] = TM8uv_SSE2;
+  VP8PredChroma8[2] = VE8uv_SSE2;
+  VP8PredChroma8[4] = DC8uvNoTop_SSE2;
+  VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/dec_sse41.c b/thirdparty/libwebp/src/dsp/dec_sse41.c
index 4e81ec4d80..8f18506d54 100644
--- a/thirdparty/libwebp/dsp/dec_sse41.c
+++ b/thirdparty/libwebp/src/dsp/dec_sse41.c
@@ -11,15 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 
 #include <smmintrin.h>
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_SSE41(uint8_t* dst) {     // horizontal
   int j;
   const __m128i kShuffle3 = _mm_set1_epi8(3);
   for (j = 16; j > 0; --j) {
@@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) {     // horizontal
 extern void VP8DspInitSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
-  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[3] = HE16_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h
index 813fed4a35..99eefe092f 100644
--- a/thirdparty/libwebp/dsp/dsp.h
+++ b/thirdparty/libwebp/src/dsp/dsp.h
@@ -15,10 +15,10 @@
 #define WEBP_DSP_DSP_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,10 +38,22 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 
+#if defined(__clang__)
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
 #ifndef __has_builtin
 # define __has_builtin(x) 0
 #endif
 
+// for now, none of the optimizations below are available in emscripten
+#if !defined(EMSCRIPTEN)
+
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
     (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -68,18 +80,20 @@ extern "C" {
 #define WEBP_USE_AVX2
 #endif
 
-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
-#define WEBP_ANDROID_NEON  // Android targets that might support NEON
-#endif
-
 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+#if (defined(__ARM_NEON__) || \
      defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
     !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif
 
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
 #define WEBP_USE_NEON
 #define WEBP_USE_INTRINSICS
@@ -90,7 +104,7 @@ extern "C" {
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
 #define WEBP_USE_MIPS_DSP_R2
 #endif
 #endif
@@ -100,6 +114,24 @@ extern "C" {
 #define WEBP_USE_MSA
 #endif
 
+#endif  /* EMSCRIPTEN */
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
@@ -129,6 +161,11 @@ extern "C" {
 #endif
 #endif
 
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
 typedef enum {
   kSSE2,
   kSSE3,
@@ -143,7 +180,7 @@ typedef enum {
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 
 //------------------------------------------------------------------------------
 // Init stub generator
@@ -271,6 +308,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
                                         int xo, int yo,  // center position
                                         int W, int H);   // plane dimension
 
+#if !defined(WEBP_REDUCE_SIZE)
 // This version is called with the guarantee that you can load 8 bytes and
 // 8 rows at offset src1 and src2
 typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@@ -278,10 +316,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
 
 extern VP8SSIMGetFunc VP8SSIMGet;         // unclipped / unchecked
 extern VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // with clipping
+#endif
 
+#if !defined(WEBP_DISABLE_STATS)
 typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
                                          const uint8_t* src2, int len);
 extern VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
 
 // must be called before using any of the above directly
 void VP8SSIMDspInit(void);
@@ -462,12 +503,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 
 // Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
-                                         const uint8_t* src);
-extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
-                                         const uint8_t* src);
-extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
-extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
 
 // Main entry calls:
 extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@@ -533,24 +574,21 @@ void WebPMultRows(uint8_t* ptr, int stride,
                   int width, int num_rows, int inverse);
 
 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse);
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
-
-// To be called first before using the above.
-void WebPInitAlphaProcessing(void);
-
-// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
-                           const uint8_t* g, const uint8_t* b, int len,
-                           uint32_t* out);
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse);
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
 
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                          int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                           int len, int step, uint32_t* out);
+
+// This function returns true if src[i] contains a value different from 0xff.
+extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+// This function returns true if src[4*i] contains a value different from 0xff.
+extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
 
 // To be called first before using the above.
-void VP8EncDspARGBInit(void);
+void WebPInitAlphaProcessing(void);
 
 //------------------------------------------------------------------------------
 // Filter functions
diff --git a/thirdparty/libwebp/dsp/enc.c b/thirdparty/libwebp/src/dsp/enc.c
index f31bc6de18..1c807f1df7 100644
--- a/thirdparty/libwebp/dsp/enc.c
+++ b/thirdparty/libwebp/src/dsp/enc.c
@@ -14,16 +14,18 @@
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 
-#include "./dsp.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"
 
 static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int clip_max(int v, int max) {
   return (v > max) ? max : v;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
@@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
   histo->last_non_zero = last_non_zero;
 }
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+                               int start_block, int end_block,
+                               VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
@@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
   }
   VP8SetHistogramData(distribution, histo);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // run-time tables (~4k)
@@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
+#if !WEBP_NEON_OMIT_C_CODE
+
 #define STORE(x, y, v) \
   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
 
@@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   }
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                         int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   int i;
   int tmp[16];
   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
     out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+                          int16_t* out) {
   VP8FTransform(src, ref, out);
   VP8FTransform(src + 4, ref + 4, out + 16);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
   // input is 12b signed
   int32_t tmp[16];
   int i;
@@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
     out[12 + i] = b3 >> 1;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef MUL
 #undef STORE
@@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top) {
   // U block
   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   VerticalPred(C8VE8 + dst, top, 8);
@@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_C(uint8_t* dst,
+                           const uint8_t* left, const uint8_t* top) {
   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
   VerticalPred(I16VE16 + dst, top, 16);
   HorizontalPred(I16HE16 + dst, left, 16);
@@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                               int w, int h) {
   int count = 0;
@@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
   return count;
 }
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 4, 4);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
   int k, x, y;
   for (k = 0; k < 4; ++k) {
     uint32_t avg = 0;
@@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.
 
+#if !WEBP_NEON_OMIT_C_CODE
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
@@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   return sum;
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
   const int sum1 = TTransform(a, w);
   const int sum2 = TTransform(b, w);
   return abs(sum2 - sum1) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_C(a + x + y, b + x + y, w);
     }
   }
   return D;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Quantization
@@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
 };
 
 // Simple quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+                           const VP8Matrix* const mtx) {
   int last = -1;
   int n;
   for (n = 0; n < 16; ++n) {
@@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (last >= 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+                             const VP8Matrix* const mtx) {
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 // Block copy
@@ -682,149 +698,15 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
   }
 }
 
-static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
   Copy(src, dst, 4, 4);
 }
 
-static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
   Copy(src, dst, 16, 8);
 }
 
 //------------------------------------------------------------------------------
-// SSIM / PSNR
-
-// hat-shaped filter. Sum of coefficients is equal to 16.
-static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
-  1, 2, 3, 4, 3, 2, 1
-};
-static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
-
-static WEBP_INLINE double SSIMCalculation(
-    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const uint32_t w2 =  N * N;
-  const uint32_t C1 = 20 * w2;
-  const uint32_t C2 = 60 * w2;
-  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
-  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
-  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
-  if (xmxm + ymym >= C3) {
-    const int64_t xmym = (int64_t)stats->xm * stats->ym;
-    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
-    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
-    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
-    // we descale by 8 to prevent overflow during the fnum/fden multiply.
-    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
-    const uint64_t den_S = (sxx + syy + C2) >> 8;
-    const uint64_t fnum = (2 * xmym + C1) * num_S;
-    const uint64_t fden = (xmxm + ymym + C1) * den_S;
-    const double r = (double)fnum / fden;
-    assert(r >= 0. && r <= 1.0);
-    return r;
-  }
-  return 1.;   // area is too dark to contribute meaningfully
-}
-
-double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, kWeightSum);
-}
-
-double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, stats->w);
-}
-
-static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
-                               const uint8_t* src2, int stride2,
-                               int xo, int yo, int W, int H) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
-  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
-                                                  : yo + VP8_SSIM_KERNEL;
-  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
-  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
-                                                  : xo + VP8_SSIM_KERNEL;
-  int x, y;
-  src1 += ymin * stride1;
-  src2 += ymin * stride2;
-  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
-    for (x = xmin; x <= xmax; ++x) {
-      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
-                       * kWeight[VP8_SSIM_KERNEL + y - yo];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.w   += w;
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStatsClipped(&stats);
-}
-
-static double SSIMGet_C(const uint8_t* src1, int stride1,
-                        const uint8_t* src2, int stride2) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  int x, y;
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const uint32_t w = kWeight[x] * kWeight[y];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStats(&stats);
-}
-
-//------------------------------------------------------------------------------
-
-static uint32_t AccumulateSSE(const uint8_t* src1,
-                              const uint8_t* src2, int len) {
-  int i;
-  uint32_t sse2 = 0;
-  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
-  for (i = 0; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-//------------------------------------------------------------------------------
-
-VP8SSIMGetFunc VP8SSIMGet;
-VP8SSIMGetClippedFunc VP8SSIMGetClipped;
-VP8AccumulateSSEFunc VP8AccumulateSSE;
-
-extern void VP8SSIMDspInitSSE2(void);
-
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
-    (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
-  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8SSIMGetClipped = SSIMGetClipped_C;
-  VP8SSIMGet = SSIMGet_C;
-
-  VP8AccumulateSSE = AccumulateSSE;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SSIMDspInitSSE2();
-    }
-#endif
-  }
-
-  ssim_last_cpuinfo_used = VP8GetCPUInfo;
-}
-
-//------------------------------------------------------------------------------
 // Initialization
 
 // Speed-critical function pointers. We have to initialize them to the default
@@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
   InitTables();
 
   // default C implementations
-  VP8CollectHistogram = CollectHistogram;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
-  VP8Copy4x4 = Copy4x4;
-  VP8Copy16x8 = Copy16x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8ITransform = ITransform_C;
+  VP8FTransform = FTransform_C;
+  VP8FTransformWHT = FTransformWHT_C;
+  VP8TDisto4x4 = Disto4x4_C;
+  VP8TDisto16x16 = Disto16x16_C;
+  VP8CollectHistogram = CollectHistogram_C;
+  VP8SSE16x16 = SSE16x16_C;
+  VP8SSE16x8 = SSE16x8_C;
+  VP8SSE8x8 = SSE8x8_C;
+  VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8EncQuantizeBlock = QuantizeBlock_C;
+  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
+  VP8EncPredLuma4 = Intra4Preds_C;
+  VP8EncPredLuma16 = Intra16Preds_C;
+  VP8EncPredChroma8 = IntraChromaPreds_C;
+  VP8Mean16x4 = Mean16x4_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+  VP8Copy4x4 = Copy4x4_C;
+  VP8Copy16x8 = Copy16x8_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
       VP8EncDspInitAVX2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8EncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8EncDspInitMIPS32();
@@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8EncDspInitNEON();
+  }
+#endif
+
+  assert(VP8ITransform != NULL);
+  assert(VP8FTransform != NULL);
+  assert(VP8FTransformWHT != NULL);
+  assert(VP8TDisto4x4 != NULL);
+  assert(VP8TDisto16x16 != NULL);
+  assert(VP8CollectHistogram != NULL);
+  assert(VP8SSE16x16 != NULL);
+  assert(VP8SSE16x8 != NULL);
+  assert(VP8SSE8x8 != NULL);
+  assert(VP8SSE4x4 != NULL);
+  assert(VP8EncQuantizeBlock != NULL);
+  assert(VP8EncQuantize2Blocks != NULL);
+  assert(VP8FTransform2 != NULL);
+  assert(VP8EncPredLuma4 != NULL);
+  assert(VP8EncPredLuma16 != NULL);
+  assert(VP8EncPredChroma8 != NULL);
+  assert(VP8Mean16x4 != NULL);
+  assert(VP8EncQuantizeBlockWHT != NULL);
+  assert(VP8Copy4x4 != NULL);
+  assert(VP8Copy16x8 != NULL);
+
   enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/enc_avx2.c b/thirdparty/libwebp/src/dsp/enc_avx2.c
index 93efb30b10..8bc5798fee 100644
--- a/thirdparty/libwebp/dsp/enc_avx2.c
+++ b/thirdparty/libwebp/src/dsp/enc_avx2.c
@@ -9,7 +9,7 @@
 //
 // AVX2 version of speed-critical encoding functions.
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_AVX2)
 
diff --git a/thirdparty/libwebp/dsp/enc_mips32.c b/thirdparty/libwebp/src/dsp/enc_mips32.c
index 752b14daf6..618f0fc0ee 100644
--- a/thirdparty/libwebp/dsp/enc_mips32.c
+++ b/thirdparty/libwebp/src/dsp/enc_mips32.c
@@ -13,13 +13,13 @@
 //            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 //            Slobodan Prijic  (slobodan.prijic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "./mips_macro.h"
-#include "../enc/vp8i_enc.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@@ -113,8 +113,9 @@ static const int kC2 = 35468;
   "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
 
 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+                                             const int16_t* in,
+                                             uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
   int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   );
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in,
-                       uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+                              uint8_t* dst, int do_two) {
+  ITransformOne_MIPS32(ref, in, dst);
   if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
+    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
   }
 }
 
@@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
   "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
   "sh           %[level],       " #N "(%[pout])                     \n\t"
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int sign, coeff, level, i;
   int max_level = MAX_LEVEL;
@@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+                                  const VP8Matrix* const mtx) {
   int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
 
@@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
   "msub   %[temp6],  %[temp0]                \n\t"                \
   "msub   %[temp7],  %[temp1]                \n\t"
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
   int tmp[32];
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
 
@@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                             const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
   "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
   "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+                              int16_t* out) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
   int temp17, temp18, temp19, temp20;
@@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
   GET_SSE_INNER(D, D + 1, D + 2, D + 3)
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 extern void VP8EncDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  VP8ITransform = ITransform_MIPS32;
+  VP8FTransform = FTransform_MIPS32;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+  VP8TDisto4x4 = Disto4x4_MIPS32;
+  VP8TDisto16x16 = Disto16x16_MIPS32;
+
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MIPS32;
+  VP8SSE8x8 = SSE8x8_MIPS32;
+  VP8SSE16x8 = SSE16x8_MIPS32;
+  VP8SSE4x4 = SSE4x4_MIPS32;
 #endif
 }
 
diff --git a/thirdparty/libwebp/dsp/enc_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
index 6c8c1c6acd..9ddd895086 100644
--- a/thirdparty/libwebp/dsp/enc_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -12,13 +12,13 @@
 // Author(s): Darko Laus (darko.laus@imgtec.com)
 //            Mirko Raus (mirko.raus@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./mips_macro.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@@ -141,7 +141,8 @@ static const int kC2 = 35468;
   "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
   "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+                                 int16_t* out) {
   const int c2217 = 2217;
   const int c5352 = 5352;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   );
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+                                 uint8_t* dst, int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+                              const uint16_t* const w) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
 
@@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
   return abs(temp3 - temp17) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+                                const uint8_t* const b,
+                                const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
   // U block
   DCMode8(C8DC8 + dst, left, top);
   VerticalPred8(C8VE8 + dst, top);
@@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+                                   const uint8_t* left, const uint8_t* top) {
   DCMode16(I16DC16 + dst, left, top);
   VerticalPred16(I16VE16 + dst, top);
   HorizontalPred16(I16HE16 + dst, left);
@@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
   GET_SSE_INNER(C)                        \
   GET_SSE_INNER(D)
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   "usw         $0,           " #J "(%[ppin])                 \n\t"        \
 "3:                                                          \n\t"
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+                                   const VP8Matrix* const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
   int sign, coeff, level;
   int max_level = MAX_LEVEL;
@@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (ret != 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+                                     const VP8Matrix* const mtx) {
   int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
 
@@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
   "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
   "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
 
@@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   "sw         %[temp8],  0(%[temp3])                   \n\t"
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+                                       int start_block, int end_block,
+                                       VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 extern void VP8EncDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
-  VP8FTransform = FTransform;
-  VP8ITransform = ITransform;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8EncPredLuma4 = Intra4Preds;
+  VP8FTransform = FTransform_MIPSdspR2;
+  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+  VP8ITransform = ITransform_MIPSdspR2;
+
+  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MIPSdspR2;
+  VP8SSE8x8 = SSE8x8_MIPSdspR2;
+  VP8SSE16x8 = SSE16x8_MIPSdspR2;
+  VP8SSE4x4 = SSE4x4_MIPSdspR2;
 #endif
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8FTransformWHT = FTransformWHT;
-  VP8CollectHistogram = CollectHistogram;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/enc_msa.c b/thirdparty/libwebp/src/dsp/enc_msa.c
index 909b46d5d9..6f85add4bb 100644
--- a/thirdparty/libwebp/dsp/enc_msa.c
+++ b/thirdparty/libwebp/src/dsp/enc_msa.c
@@ -11,13 +11,13 @@
 //
 // Author:  Prashant Patil   (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
 #include <stdlib.h>
-#include "./msa_macro.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/msa_macro.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms
@@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                           int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+                           int16_t* out) {
   uint64_t out0, out1, out2, out3;
   uint32_t in0, in1, in2, in3;
   v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
   v8i16 t0, t1, t2, t3;
-  v16u8 srcl0, srcl1, src0, src1;
+  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
   const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
   const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
   const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   SD4(out0, out1, out2, out3, out, 8);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
   v8i16 in0 = { 0 };
   v8i16 in1 = { 0 };
   v8i16 tmp0, tmp1, tmp2, tmp3;
@@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
   ST_SH2(out0, out1, out, 8);
 }
 
-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
   int sum;
   uint32_t in0_m, in1_m, in2_m, in3_m;
-  v16i8 src0;
+  v16i8 src0 = { 0 };
   v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
   v4i32 dst0, dst1;
   const v16i8 zero = { 0 };
@@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   return sum;
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int sum1 = TTransform(a, w);
-  const int sum2 = TTransform(b, w);
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  const int sum1 = TTransform_MSA(a, w);
+  const int sum2 = TTransform_MSA(b, w);
   return abs(sum2 - sum1) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MSA(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 //------------------------------------------------------------------------------
 // Histogram
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+                                 int start_block, int end_block,
+                                 VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
@@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
 static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top - 1);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
   const v16u8 B = SLDI_UB(A, A, 1);
   const v16u8 C = SLDI_UB(A, A, 2);
   const v16u8 AC = __msa_ave_u_b(A, C);
@@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
 }
 
 static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A2 = { 0 };
   const uint64_t val_m = LD(top - 5);
-  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
   const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
   const v16u8 B = SLDI_UB(A, A, 1);
   const v16u8 C = SLDI_UB(A, A, 2);
@@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
 }
 
 static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
   const v16u8 B = SLDI_UB(A, A, 1);
   const v16u8 C1 = SLDI_UB(A, A, 2);
   const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2
 
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
   STORE16x16(out, dst);
 }
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* dst,
+                             const uint8_t* left, const uint8_t* top) {
   DCMode16x16(I16DC16 + dst, left, top);
   VerticalPred16x16(I16VE16 + dst, top);
   HorizontalPred16x16(I16HE16 + dst, left);
@@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
 static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top) {
   uint64_t out;
-  v16u8 src;
+  v16u8 src = { 0 };
   if (top != NULL && left != NULL) {
     const uint64_t left_m = LD(left);
     const uint64_t top_m = LD(top);
@@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
   STORE8x8(out, dst);
 }
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+                                 const uint8_t* top) {
   // U block
   DCMode8x8(C8DC8 + dst, left, top);
   VerticalPred8x8(C8VE8 + dst, top);
@@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
   DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return sum;
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return sum;
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return sum;
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum = 0;
   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 src, ref, tmp0, tmp1;
+  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
   v8i16 diff0, diff1;
   v4i32 out0, out1;
 
@@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 //------------------------------------------------------------------------------
 // Quantization
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+                             const VP8Matrix* const mtx) {
   int sum;
   v8i16 in0, in1, sh0, sh1, out0, out1;
   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   tmp1 = (tmp3 > maxlevel);
   tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
   tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
-  SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
+  SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
   tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
   tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
   LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
@@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (sum > 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+                               const VP8Matrix* const mtx) {
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransformWHT = FTransformWHT;
-
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8CollectHistogram = CollectHistogram;
-
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
-
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
+  VP8ITransform = ITransform_MSA;
+  VP8FTransform = FTransform_MSA;
+  VP8FTransformWHT = FTransformWHT_MSA;
+
+  VP8TDisto4x4 = Disto4x4_MSA;
+  VP8TDisto16x16 = Disto16x16_MSA;
+  VP8CollectHistogram = CollectHistogram_MSA;
+
+  VP8EncPredLuma4 = Intra4Preds_MSA;
+  VP8EncPredLuma16 = Intra16Preds_MSA;
+  VP8EncPredChroma8 = IntraChromaPreds_MSA;
+
+  VP8SSE16x16 = SSE16x16_MSA;
+  VP8SSE16x8 = SSE16x8_MSA;
+  VP8SSE8x8 = SSE8x8_MSA;
+  VP8SSE4x4 = SSE4x4_MSA;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MSA;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c
index 6a078d632d..43bf1245c5 100644
--- a/thirdparty/libwebp/dsp/enc_neon.c
+++ b/thirdparty/libwebp/src/dsp/enc_neon.c
@@ -11,14 +11,14 @@
 //
 // adapted from libvpx (http://www.webmproject.org/code/)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
 
-#include "./neon.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/neon.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -37,15 +37,15 @@ static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 #if defined(WEBP_USE_INTRINSICS)
 
 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
 }
 
 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 // to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
-                                            const int16x8_t dst01,
-                                            const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+                                                 const int16x8_t dst01,
+                                                 const int16x8_t dst23) {
   // Unsigned saturate to 8b.
   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 }
 
-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
-                               const uint8_t* const ref, uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+                                    const int16x8_t row23,
+                                    const uint8_t* const ref,
+                                    uint8_t* const dst) {
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);
 
@@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
 
   {
     // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
-    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
 
     // Descale with rounding.
     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
     // Add the inverse transform.
-    SaturateAndStore4x4(dst, out01, out23);
+    SaturateAndStore4x4_NEON(dst, out01, out23);
   }
 }
 
-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
-                                     int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+                                          const int16x8_t in1,
+                                          int16x8x2_t* const out) {
   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
@@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
 }
 
-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   // {rows} = in0 | in4
   //          in8 | in12
   // B1 = in4 | in12
@@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
-  Transpose8x2(E0, E1, rows);
+  Transpose8x2_NEON(E0, E1, rows);
 }
 
-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
-  TransformPass(&rows);
-  TransformPass(&rows);
-  Add4x4(rows.val[0], rows.val[1], ref, dst);
+  TransformPass_NEON(&rows);
+  TransformPass_NEON(&rows);
+  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
 }
 
 #else
 
-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
   const int kBPS = BPS;
   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
 
@@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,
 
 #endif    // WEBP_USE_INTRINSICS
 
-static void ITransform(const uint8_t* ref,
-                       const int16_t* in, uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
+static void ITransform_NEON(const uint8_t* ref,
+                            const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne_NEON(ref, in, dst);
   if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
+    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
   }
 }
 
 // Load all 4x4 pixels into a single uint8x16_t variable.
-static uint8x16_t Load4x4(const uint8_t* src) {
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
   uint32x4_t out = vdupq_n_u32(0);
   out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
   out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {
 
 #if defined(WEBP_USE_INTRINSICS)
 
-static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
-                                         const int16x4_t C, const int16x4_t D,
-                                         int16x8_t* const out01,
-                                         int16x8_t* const out32) {
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+                                              const int16x4_t B,
+                                              const int16x4_t C,
+                                              const int16x4_t D,
+                                              int16x8_t* const out01,
+                                              int16x8_t* const out32) {
   const int16x4x2_t AB = vtrn_s16(A, B);
   const int16x4x2_t CD = vtrn_s16(C, D);
   const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                    vreinterpret_s64_s32(tmp02.val[1])));
 }
 
-static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
-                                         const uint8x8_t b) {
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+                                              const uint8x8_t b) {
   return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
   {
-    const uint8x16_t S0 = Load4x4(src);
-    const uint8x16_t R0 = Load4x4(ref);
-    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
-    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
+    const uint8x16_t S0 = Load4x4_NEON(src);
+    const uint8x16_t R0 = Load4x4_NEON(ref);
+    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
     const int16x4_t D0 = vget_low_s16(D0D1);
     const int16x4_t D1 = vget_high_s16(D0D1);
     const int16x4_t D2 = vget_low_s16(D2D3);
     const int16x4_t D3 = vget_high_s16(D2D3);
-    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
+    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
   }
   {    // 1rst pass
     const int32x4_t kCst937 = vdupq_n_s32(937);
@@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
     const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
     const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
-    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
   }
   {    // 2nd pass
     // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
@@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
   51000, 51000, 51000, 51000
 };
 
-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
   const int kBPS = BPS;
   const uint8_t* src_ptr = src;
   const uint8_t* ref_ptr = ref;
@@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
   src += stride;                                    \
 } while (0)
 
-static void FTransformWHT(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
   const int stride = 16;
   const int16x4_t zero = vdup_n_s16(0);
   int32x4x4_t tmp0;
@@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
     tmp0.val[3] = vsubq_s32(a0, a1);
   }
   {
-    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
+    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
     // a0 = tmp[0 + i] + tmp[ 8 + i]
     // a1 = tmp[4 + i] + tmp[12 + i]
     // a2 = tmp[4 + i] - tmp[12 + i]
@@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
 // a 26ae, b 26ae
 // a 37bf, b 37bf
 //
-static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
   const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
   const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
   const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
   return q4_in;
 }
 
-static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+    const int16x8x4_t q4_in) {
   // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
   // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
   const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
   return q4_out;
 }
 
-static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
   const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                         q4_in.val[2]));
   const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
   return q4_out;
 }
 
-static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
   const uint16x8_t q_w07 = vld1q_u16(&w[0]);
   const uint16x8_t q_w8f = vld1q_u16(&w[8]);
   int16x4x4_t d4_w;
@@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
   return d4_w;
 }
 
-static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
-                                      const int16x4x4_t d4_w) {
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+                                           const int16x4x4_t d4_w) {
   int32x2_t d_sum;
   // sum += w[ 0] * abs(b0);
   // sum += w[ 4] * abs(b1);
@@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
     // Vertical pass first to avoid a transpose (vertical and horizontal passes
     // are commutative because w/kWeightY is symmetric) and subsequent
     // transpose.
-    const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
-    const int16x4x4_t d4_w = DistoLoadW(w);
+    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
     // horizontal pass
-    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
-    const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
-    int32x2_t d_sum = DistoSum(q4_h, d4_w);
+    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
 
     // abs(sum2 - sum1) >> 5
     d_sum = vabs_s32(d_sum);
@@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_NEON(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 
 //------------------------------------------------------------------------------
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
     int16_t out[16];
-    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
     {
       int k;
       const int16x8_t a0 = vld1q_s16(out + 0);
@@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 
 //------------------------------------------------------------------------------
 
-static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
-                                        const uint8_t* const b,
-                                        uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+                                             const uint8_t* const b,
+                                             uint32x4_t* const sum) {
   const uint8x16_t a0 = vld1q_u8(a);
   const uint8x16_t b0 = vld1q_u8(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
 }
 
 // Horizontal sum of all four uint32_t values in 'sum'.
-static int SumToInt(uint32x4_t sum) {
+static int SumToInt_NEON(uint32x4_t sum) {
   const uint64x2_t sum2 = vpaddlq_u32(sum);
   const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
   return (int)sum3;
@@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 16; ++y) {
-    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
   }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }
 
 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {
-    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
   }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }
 
 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
     const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
     sum = vpadalq_u16(sum, prod);
   }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }
 
 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
-  const uint8x16_t a0 = Load4x4(a);
-  const uint8x16_t b0 = Load4x4(b);
+  const uint8x16_t a0 = Load4x4_NEON(a);
+  const uint8x16_t b0 = Load4x4_NEON(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
   const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                     vget_low_u8(abs_diff));
@@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
   /* pair-wise adds and widen */
   const uint32x4_t sum1 = vpaddlq_u16(prod1);
   const uint32x4_t sum2 = vpaddlq_u16(prod2);
-  return SumToInt(vaddq_u32(sum1, sum2));
+  return SumToInt_NEON(vaddq_u32(sum1, sum2));
 }
 
 //------------------------------------------------------------------------------
@@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)
 
-static int16x8_t Quantize(int16_t* const in,
-                          const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* const in,
+                               const VP8Matrix* const mtx, int offset) {
   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
   { 14, 15, 22, 23, 28, 29, 30, 31 }
 };
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  const int16x8_t out0 = Quantize(in, mtx, 0);
-  const int16x8_t out1 = Quantize(in, mtx, 8);
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
   uint8x8x4_t shuffles;
   // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
   // non-standard versions there.
@@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
   int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
 
@@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
+  VP8ITransform = ITransform_NEON;
+  VP8FTransform = FTransform_NEON;
 
-  VP8FTransformWHT = FTransformWHT;
+  VP8FTransformWHT = FTransformWHT_NEON;
 
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8CollectHistogram = CollectHistogram;
+  VP8TDisto4x4 = Disto4x4_NEON;
+  VP8TDisto16x16 = Disto16x16_NEON;
+  VP8CollectHistogram = CollectHistogram_NEON;
 
   VP8SSE16x16 = SSE16x16_NEON;
   VP8SSE16x8 = SSE16x8_NEON;
@@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
   VP8SSE4x4 = SSE4x4_NEON;
 
 #if !defined(WORK_AROUND_GCC)
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlock = QuantizeBlock_NEON;
+  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
 #endif
 }
 
diff --git a/thirdparty/libwebp/dsp/enc_sse2.c b/thirdparty/libwebp/src/dsp/enc_sse2.c
index 2026a74c91..7b3f142c31 100644
--- a/thirdparty/libwebp/dsp/enc_sse2.c
+++ b/thirdparty/libwebp/src/dsp/enc_sse2.c
@@ -11,23 +11,23 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 
-#include "./common_sse2.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
 // Does one or two inverse transforms.
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                            int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
   }
 }
 
-static void FTransformPass1(const __m128i* const in01,
-                            const __m128i* const in23,
-                            __m128i* const out01,
-                            __m128i* const out32) {
+static void FTransformPass1_SSE2(const __m128i* const in01,
+                                 const __m128i* const in23,
+                                 __m128i* const out01,
+                                 __m128i* const out32) {
   const __m128i k937 = _mm_set1_epi32(937);
   const __m128i k1812 = _mm_set1_epi32(1812);
 
@@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
   *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
 }
 
-static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
-                            int16_t* out) {
+static void FTransformPass2_SSE2(const __m128i* const v01,
+                                 const __m128i* const v32,
+                                 int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i seven = _mm_set1_epi16(7);
   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
@@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
   _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
   // Load src.
   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   __m128i v01, v32;
 
   // First pass
-  FTransformPass1(&row01, &row23, &v01, &v32);
+  FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
 
   // Second pass
-  FTransformPass2(&v01, &v32, out);
+  FTransformPass2_SSE2(&v01, &v32, out);
 }
 
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+                             int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
 
   // Load src and convert to 16b.
@@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   __m128i v01h, v32h;
 
   // First pass
-  FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
-  FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
+  FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+  FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
 
   // Second pass
-  FTransformPass2(&v01l, &v32l, out + 0);
-  FTransformPass2(&v01h, &v32h, out + 16);
+  FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
 }
 
-static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
   const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
   const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
   const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
   *out = _mm_madd_epi16(D, kMult);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
   // Input is 12b signed.
   __m128i row0, row1, row2, row3;
   // Rows are 14b signed.
-  FTransformWHTRow(in + 0 * 64, &row0);
-  FTransformWHTRow(in + 1 * 64, &row1);
-  FTransformWHTRow(in + 2 * 64, &row2);
-  FTransformWHTRow(in + 3 * 64, &row3);
+  FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+  FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+  FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+  FTransformWHTRow_SSE2(in + 3 * 64, &row3);
 
   {
     // The a* are 15b signed.
@@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
@@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
     int16_t out[16];
     int k;
 
-    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
     // Convert coefficients to bin (within out[]).
     {
@@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // Intra predictions
 
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 8; ++j) {
@@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
   }
 }
 
-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 16; ++j) {
@@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
   }
 }
 
-static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
   if (size == 4) {
     int j;
     for (j = 0; j < 4; ++j) {
       memset(dst + j * BPS, value, 4);
     }
   } else if (size == 8) {
-    Put8x8uv(value, dst);
+    Put8x8uv_SSE2(value, dst);
   } else {
-    Put16(value, dst);
+    Put16_SSE2(value, dst);
   }
 }
 
-static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
   int j;
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   for (j = 0; j < 8; ++j) {
@@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
   }
 }
 
-static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i top_values = _mm_load_si128((const __m128i*)top);
   int j;
   for (j = 0; j < 16; ++j) {
@@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
   }
 }
 
-static WEBP_INLINE void VerticalPred(uint8_t* dst,
-                                     const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+                                          const uint8_t* top, int size) {
   if (top != NULL) {
     if (size == 8) {
-      VE8uv(dst, top);
+      VE8uv_SSE2(dst, top);
     } else {
-      VE16(dst, top);
+      VE16_SSE2(dst, top);
     }
   } else {
-    Fill(dst, 127, size);
+    Fill_SSE2(dst, 127, size);
   }
 }
 
-static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 8; ++j) {
     const __m128i values = _mm_set1_epi8(left[j]);
@@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
   }
 }
 
-static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 16; ++j) {
     const __m128i values = _mm_set1_epi8(left[j]);
@@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
   }
 }
 
-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
-                                       const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+                                            const uint8_t* left, int size) {
   if (left != NULL) {
     if (size == 8) {
-      HE8uv(dst, left);
+      HE8uv_SSE2(dst, left);
     } else {
-      HE16(dst, left);
+      HE16_SSE2(dst, left);
     }
   } else {
-    Fill(dst, 129, size);
+    Fill_SSE2(dst, 129, size);
   }
 }
 
-static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
-                           const uint8_t* top, int size) {
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+                                const uint8_t* top, int size) {
   const __m128i zero = _mm_setzero_si128();
   int y;
   if (size == 8) {
@@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
   }
 }
 
-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top, int size) {
   if (left != NULL) {
     if (top != NULL) {
-      TM(dst, left, top, size);
+      TM_SSE2(dst, left, top, size);
     } else {
-      HorizontalPred(dst, left, size);
+      HorizontalPred_SSE2(dst, left, size);
     }
   } else {
     // true motion without left samples (hence: with default 129 value)
@@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
     // Note that if top samples are not available, the default value is
     // then 129, and not 127 as in the VerticalPred case.
     if (top != NULL) {
-      VerticalPred(dst, top, size);
+      VerticalPred_SSE2(dst, top, size);
     } else {
-      Fill(dst, 129, size);
+      Fill_SSE2(dst, 129, size);
     }
   }
 }
 
-static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top) {
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top) {
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
   const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
   const int DC = VP8HorizontalAdd8b(&combined) + 8;
-  Put8x8uv(DC >> 4, dst);
+  Put8x8uv_SSE2(DC >> 4, dst);
 }
 
-static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i sum = _mm_sad_epu8(top_values, zero);
   const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv(DC >> 3, dst);
+  Put8x8uv_SSE2(DC >> 3, dst);
 }
 
-static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
   // 'left' is contiguous so we can reuse the top summation.
-  DC8uvNoLeft(dst, left);
+  DC8uvNoLeft_SSE2(dst, left);
 }
 
-static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
-  Put8x8uv(0x80, dst);
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+  Put8x8uv_SSE2(0x80, dst);
 }
 
-static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
-      DC8uv(dst, left, top);
+      DC8uv_SSE2(dst, left, top);
     } else {  // top, but no left
-      DC8uvNoLeft(dst, top);
+      DC8uvNoLeft_SSE2(dst, top);
     }
   } else if (left != NULL) {  // left but no top
-    DC8uvNoTop(dst, left);
+    DC8uvNoTop_SSE2(dst, left);
   } else {  // no top, no left, nothing.
-    DC8uvNoTopLeft(dst);
+    DC8uvNoTopLeft_SSE2(dst);
   }
 }
 
-static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const __m128i left_row = _mm_load_si128((const __m128i*)left);
   const int DC =
       VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
-  Put16(DC >> 5, dst);
+  Put16_SSE2(DC >> 5, dst);
 }
 
-static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const int DC = VP8HorizontalAdd8b(&top_row) + 8;
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }
 
-static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
   // 'left' is contiguous so we can reuse the top summation.
-  DC16NoLeft(dst, left);
+  DC16NoLeft_SSE2(dst, left);
 }
 
-static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
-  Put16(0x80, dst);
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+  Put16_SSE2(0x80, dst);
 }
 
-static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+                                      const uint8_t* top) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
-      DC16(dst, left, top);
+      DC16_SSE2(dst, left, top);
     } else {  // top, but no left
-      DC16NoLeft(dst, top);
+      DC16NoLeft_SSE2(dst, top);
     }
   } else if (left != NULL) {  // left but no top
-    DC16NoTop(dst, left);
+    DC16NoTop_SSE2(dst, left);
   } else {  // no top, no left, nothing.
-    DC16NoTopLeft(dst);
+    DC16NoTopLeft_SSE2(dst);
   }
 }
 
@@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
 
-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // vertical
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
   }
 }
 
-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // horizontal
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];
@@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
 
-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
-  Fill(dst, dc >> 3, 4);
+  Fill_SSE2(dst, dc >> 3, 4);
 }
 
-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
   WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
-static WEBP_INLINE void VR4(uint8_t* dst,
-                            const uint8_t* top) {  // Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Right
   const __m128i one = _mm_set1_epi8(1);
   const int I = top[-2];
   const int J = top[-3];
@@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
   DST(0, 3) = AVG3(K, J, I);
 }
 
-static WEBP_INLINE void VL4(uint8_t* dst,
-                            const uint8_t* top) {  // Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
   DST(3, 3) = (extra_out >> 8) & 0xff;
 }
 
-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-right
   const __m128i one = _mm_set1_epi8(1);
   const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
   const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
   WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
   const int I = top[-2];
   const int J = top[-3];
   const int K = top[-4];
@@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];
@@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
   DST(1, 3)             = AVG3(L, K, J);
 }
 
-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
-  DC4(I4DC4 + dst, top);
-  TM4(I4TM4 + dst, top);
-  VE4(I4VE4 + dst, top);
-  HE4(I4HE4 + dst, top);
-  RD4(I4RD4 + dst, top);
-  VR4(I4VR4 + dst, top);
-  LD4(I4LD4 + dst, top);
-  VL4(I4VL4 + dst, top);
-  HD4(I4HD4 + dst, top);
-  HU4(I4HU4 + dst, top);
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+  DC4_SSE2(I4DC4 + dst, top);
+  TM4_SSE2(I4TM4 + dst, top);
+  VE4_SSE2(I4VE4 + dst, top);
+  HE4_SSE2(I4HE4 + dst, top);
+  RD4_SSE2(I4RD4 + dst, top);
+  VR4_SSE2(I4VR4 + dst, top);
+  LD4_SSE2(I4LD4 + dst, top);
+  VL4_SSE2(I4VL4 + dst, top);
+  HD4_SSE2(I4HD4 + dst, top);
+  HU4_SSE2(I4HU4 + dst, top);
 }
 
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
   // U block
-  DC8uvMode(C8DC8 + dst, left, top);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
   // V block
   dst += 8;
   if (top != NULL) top += 8;
   if (left != NULL) left += 16;
-  DC8uvMode(C8DC8 + dst, left, top);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
 }
 
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
-  DC16Mode(I16DC16 + dst, left, top);
-  VerticalPred(I16VE16 + dst, top, 16);
-  HorizontalPred(I16HE16 + dst, left, 16);
-  TrueMotion(I16TM16 + dst, left, top, 16);
+static void Intra16Preds_SSE2(uint8_t* dst,
+                              const uint8_t* left, const uint8_t* top) {
+  DC16Mode_SSE2(I16DC16 + dst, left, top);
+  VerticalPred_SSE2(I16VE16 + dst, top, 16);
+  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
 }
 
 //------------------------------------------------------------------------------
 // Metric
 
-static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
-                                              __m128i* const sum) {
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+                                                   const __m128i b,
+                                                   __m128i* const sum) {
   // take abs(a-b) in 8b
   const __m128i a_b = _mm_subs_epu8(a, b);
   const __m128i b_a = _mm_subs_epu8(b, a);
@@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
   *sum = _mm_add_epi32(sum1, sum2);
 }
 
-static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
-                                int num_pairs) {
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+                                     int num_pairs) {
   __m128i sum = _mm_setzero_si128();
   int32_t tmp[4];
   int i;
@@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
     __m128i sum1, sum2;
-    SubtractAndAccumulate(a0, b0, &sum1);
-    SubtractAndAccumulate(a1, b1, &sum2);
+    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
     sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
     a += 2 * BPS;
     b += 2 * BPS;
@@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN(a, b, 8);
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 8);
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN(a, b, 4);
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 4);
 }
 
 #define LOAD_8x16b(ptr) \
   _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
   const __m128i zero = _mm_setzero_si128();
   int num_pairs = 4;
   __m128i sum = zero;
@@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 }
 #undef LOAD_8x16b
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
   const __m128i zero = _mm_setzero_si128();
 
   // Load values. Note that we read 8 pixels instead of 4,
@@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 
 //------------------------------------------------------------------------------
 
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
   const __m128i mask = _mm_set1_epi16(0x00ff);
   const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
   const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
-                      const uint16_t* const w) {
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+                           const uint16_t* const w) {
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();
@@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE2(a, b, w);
   return abs(diff_sum) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //
 
-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       const uint16_t* const sharpen,
-                                       const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                                            const uint16_t* const sharpen,
+                                            const VP8Matrix* const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i zero = _mm_setzero_si128();
   __m128i coeff0, coeff8;
@@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
   return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
 }
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
 }
 
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+                                 const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
   return nz;
 }
 
@@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
-  VP8CollectHistogram = CollectHistogram;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
-}
-
-//------------------------------------------------------------------------------
-// SSIM / PSNR entry point (TODO(skal): move to its own file later)
-
-static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
-                                   const uint8_t* src2, int len) {
-  int i = 0;
-  uint32_t sse2 = 0;
-  if (len >= 16) {
-    const int limit = len - 32;
-    int32_t tmp[4];
-    __m128i sum1;
-    __m128i sum = _mm_setzero_si128();
-    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-    i += 16;
-    while (i <= limit) {
-      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      __m128i sum2;
-      i += 16;
-      SubtractAndAccumulate(a0, b0, &sum1);
-      sum = _mm_add_epi32(sum, sum1);
-      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      i += 16;
-      SubtractAndAccumulate(a1, b1, &sum2);
-      sum = _mm_add_epi32(sum, sum2);
-    }
-    SubtractAndAccumulate(a0, b0, &sum1);
-    sum = _mm_add_epi32(sum, sum1);
-    _mm_storeu_si128((__m128i*)tmp, sum);
-    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-  }
-
-  for (; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-static uint32_t HorizontalAdd16b(const __m128i* const m) {
-  uint16_t tmp[8];
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi16(*m, a);
-  _mm_storeu_si128((__m128i*)tmp, b);
-  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
-}
-
-static uint32_t HorizontalAdd32b(const __m128i* const m) {
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi32(*m, a);
-  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
-  return (uint32_t)_mm_cvtsi128_si32(c);
-}
-
-static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
-
-#define ACCUMULATE_ROW(WEIGHT) do {                         \
-  /* compute row weight (Wx * Wy) */                        \
-  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
-  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
-  /* process 8 bytes at a time (7 bytes, actually) */       \
-  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
-  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
-  /* convert to 16b and multiply by weight */               \
-  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
-  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
-  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
-  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
-  /* accumulate */                                          \
-  xm  = _mm_add_epi16(xm, wa1);                             \
-  ym  = _mm_add_epi16(ym, wb1);                             \
-  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
-  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
-  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
-  src1 += stride1;                                          \
-  src2 += stride2;                                          \
-} while (0)
-
-static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
-                           const uint8_t* src2, int stride2) {
-  VP8DistoStats stats;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i xm = zero, ym = zero;                // 16b accums
-  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
-  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
-  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
-  ACCUMULATE_ROW(1);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(4);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(1);
-  stats.xm  = HorizontalAdd16b(&xm);
-  stats.ym  = HorizontalAdd16b(&ym);
-  stats.xxm = HorizontalAdd32b(&xxm);
-  stats.xym = HorizontalAdd32b(&xym);
-  stats.yym = HorizontalAdd32b(&yym);
-  return VP8SSIMFromStats(&stats);
-}
-
-extern void VP8SSIMDspInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
-  VP8AccumulateSSE = AccumulateSSE_SSE2;
-  VP8SSIMGet = SSIMGet_SSE2;
+  VP8CollectHistogram = CollectHistogram_SSE2;
+  VP8EncPredLuma16 = Intra16Preds_SSE2;
+  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+  VP8EncPredLuma4 = Intra4Preds_SSE2;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+  VP8ITransform = ITransform_SSE2;
+  VP8FTransform = FTransform_SSE2;
+  VP8FTransform2 = FTransform2_SSE2;
+  VP8FTransformWHT = FTransformWHT_SSE2;
+  VP8SSE16x16 = SSE16x16_SSE2;
+  VP8SSE16x8 = SSE16x8_SSE2;
+  VP8SSE8x8 = SSE8x8_SSE2;
+  VP8SSE4x4 = SSE4x4_SSE2;
+  VP8TDisto4x4 = Disto4x4_SSE2;
+  VP8TDisto16x16 = Disto16x16_SSE2;
+  VP8Mean16x4 = Mean16x4_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
 
 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
-WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
 
 #endif  // WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/enc_sse41.c b/thirdparty/libwebp/src/dsp/enc_sse41.c
index e32086d9fd..924035a644 100644
--- a/thirdparty/libwebp/dsp/enc_sse41.c
+++ b/thirdparty/libwebp/src/dsp/enc_sse41.c
@@ -11,21 +11,21 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 #include <smmintrin.h>
 #include <stdlib.h>  // for abs()
 
-#include "./common_sse2.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+                                   int start_block, int end_block,
+                                   VP8Histogram* const histo) {
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
-                      const uint16_t* const w) {
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+                            const uint16_t* const w) {
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
 
@@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE41(a, b, w);
   return abs(diff_sum) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+                            const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_SSE41(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
                2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
 
-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       const uint16_t* const sharpen,
-                                       const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                                             const uint16_t* const sharpen,
+                                             const VP8Matrix* const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i zero = _mm_setzero_si128();
   __m128i out0, out8;
@@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
 
 #undef PSHUFB_CST
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                               const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
 }
 
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+                                  const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+                                 const VP8Matrix* const mtx) {
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
   return nz;
 }
 
@@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 
 extern void VP8EncDspInitSSE41(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
-  VP8CollectHistogram = CollectHistogram;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  VP8CollectHistogram = CollectHistogram_SSE41;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+  VP8TDisto4x4 = Disto4x4_SSE41;
+  VP8TDisto16x16 = Disto16x16_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/filters.c b/thirdparty/libwebp/src/dsp/filters.c
index 65f34aad1f..ca5f877da7 100644
--- a/thirdparty/libwebp/dsp/filters.c
+++ b/thirdparty/libwebp/src/dsp/filters.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,16 +20,17 @@
 // Helpful macro.
 
 # define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
+  assert((in) != NULL);                                                        \
+  assert((out) != NULL);                                                       \
   assert(width > 0);                                                           \
   assert(height > 0);                                                          \
   assert(stride >= width);                                                     \
   assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
   (void)height;  // Silence unused warning.
 
-static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
-                                    uint8_t* dst, int length, int inverse) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
+                                      uint8_t* dst, int length, int inverse) {
   int i;
   if (inverse) {
     for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@@ -41,10 +42,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
 //------------------------------------------------------------------------------
 // Horizontal filter.
 
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
+                                             int width, int height, int stride,
+                                             int row, int num_rows,
+                                             int inverse, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -56,7 +57,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   if (row == 0) {
     // Leftmost pixel is the same as input for topmost scanline.
     out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     row = 1;
     preds += stride;
     in += stride;
@@ -66,8 +67,8 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   // Filter line-by-line.
   while (row < last_row) {
     // Leftmost pixel is predicted from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     ++row;
     preds += stride;
     in += stride;
@@ -78,10 +79,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
 
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -94,7 +95,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
     // Very first top-left pixel is copied.
     out[0] = in[0];
     // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     row = 1;
     in += stride;
     out += stride;
@@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 
   // Filter line-by-line.
   while (row < last_row) {
-    PredictLine(in, preds, out, width, inverse);
+    PredictLine_C(in, preds, out, width, inverse);
     ++row;
     preds += stride;
     in += stride;
     out += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Gradient filter.
 
-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
   const int g = a + b - c;
   return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // left prediction for top scan-line
   if (row == 0) {
     out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     row = 1;
     preds += stride;
     in += stride;
@@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   while (row < last_row) {
     int w;
     // leftmost pixel: predict from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in, preds - stride, out, 1, inverse);
     for (w = 1; w < width; ++w) {
-      const int pred = GradientPredictor(preds[w - 1],
-                                         preds[w - stride],
-                                         preds[w - stride - 1]);
+      const int pred = GradientPredictor_C(preds[w - 1],
+                                           preds[w - stride],
+                                           preds[w - stride - 1]);
       out[w] = in[w] + (inverse ? pred : -pred);
     }
     ++row;
@@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
     out += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef SANITY_CHECK
 
 //------------------------------------------------------------------------------
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+#if !WEBP_NEON_OMIT_C_CODE
+static void HorizontalFilter_C(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
+                       filtered_data);
 }
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void VerticalFilter_C(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }
 
-
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void GradientFilter_C(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }
-
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                                 uint8_t* out, int width) {
   uint8_t pred = (prev == NULL) ? 0 : prev[0];
   int i;
   for (i = 0; i < width; ++i) {
@@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
   }
 }
 
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_C(NULL, in, out, width);
   } else {
     int i;
     for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_C(NULL, in, out, width);
   } else {
     uint8_t top = prev[0], top_left = top, left = top;
     int i;
     for (i = 0; i < width; ++i) {
       top = prev[i];  // need to read this first, in case prev==out
-      left = in[i] + GradientPredictor(left, top, top_left);
+      left = in[i] + GradientPredictor_C(left, top, top_left);
       top_left = top;
       out[i] = left;
     }
@@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
   if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
 
   WebPUnfilters[WEBP_FILTER_NONE] = NULL;
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
+#endif
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
 
   WebPFilters[WEBP_FILTER_NONE] = NULL;
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
+#endif
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
       VP8FiltersInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8FiltersInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       VP8FiltersInitMIPSdspR2();
@@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8FiltersInitNEON();
+  }
+#endif
+
+  assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
+  assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
+
   filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/filters_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
index 1d82e3c2e1..9382b12823 100644
--- a/thirdparty/libwebp/dsp/filters_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "../dsp/dsp.h"
+#include "src/dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@@ -101,8 +101,8 @@
     );                                                                         \
   } while (0)
 
-static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
-                                    int length) {
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+                                              int length) {
   DO_PREDICT_LINE(src, dst, length, 0);
 }
 
@@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
     }                                                                          \
   } while (0)
 
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+                                                     int width, int height,
+                                                     int stride,
+                                                     int row, int num_rows,
+                                                     uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   if (row == 0) {
     // Leftmost pixel is the same as input for topmost scanline.
     out[0] = in[0];
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
     row = 1;
     preds += stride;
     in += stride;
@@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+                                       int width, int height,
+                                       int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                               filtered_data);
 }
 
 //------------------------------------------------------------------------------
@@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
     }                                                                          \
   } while (0)
 
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+                                                   int width, int height,
+                                                   int stride,
+                                                   int row, int num_rows,
+                                                   uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
     // Very first top-left pixel is copied.
     out[0] = in[0];
     // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
 }
 
 //------------------------------------------------------------------------------
 // Gradient filter.
 
-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
   int temp0;
   __asm__ volatile (
     "addu             %[temp0],   %[a],       %[b]        \n\t"
@@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
       int w;                                                                   \
       PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
       for (w = 1; w < width; ++w) {                                            \
-        const int pred = GradientPredictor(PREDS[w - 1],                       \
-                                           PREDS[w - stride],                  \
-                                           PREDS[w - stride - 1]);             \
+        const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1],             \
+                                                     PREDS[w - stride],        \
+                                                     PREDS[w - stride - 1]);   \
         out[w] = in[w] OPERATION pred;                                         \
       }                                                                        \
       ++row;                                                                   \
@@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
     }                                                                          \
   } while (0)
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+                                       int width, int height, int stride,
+                                       int row, int num_rows, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // left prediction for top scan-line
   if (row == 0) {
     out[0] = in[0];
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
     row = 1;
     preds += stride;
     in += stride;
@@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
 }
 
 //------------------------------------------------------------------------------
 
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                         uint8_t* out, int width) {
  out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
  DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
 }
 
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
   } else {
     DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
   }
 }
 
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
   } else {
     uint8_t top = prev[0], top_left = top, left = top;
     int i;
     for (i = 0; i < width; ++i) {
       top = prev[i];  // need to read this first, in case prev==dst
-      left = in[i] + GradientPredictor(left, top, top_left);
+      left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
       top_left = top;
       out[i] = left;
     }
@@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
 
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/filters_msa.c b/thirdparty/libwebp/src/dsp/filters_msa.c
index 4b8922d0bc..14c437d141 100644
--- a/thirdparty/libwebp/dsp/filters_msa.c
+++ b/thirdparty/libwebp/src/dsp/filters_msa.c
@@ -11,11 +11,11 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"
 
 #include <assert.h>
 
@@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 // Horrizontal filter
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+                                 int stride, uint8_t* filtered_data) {
   const uint8_t* preds = data;
   const uint8_t* in = data;
   uint8_t* out = filtered_data;
@@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }
 
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
   const uint8_t* in = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
@@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 // Vertical filter
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
   const uint8_t* in = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
@@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
 extern void VP8FiltersInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/filters_neon.c b/thirdparty/libwebp/src/dsp/filters_neon.c
index 4d6e50cc76..3e6a578ea7 100644
--- a/thirdparty/libwebp/dsp/filters_neon.c
+++ b/thirdparty/libwebp/src/dsp/filters_neon.c
@@ -11,12 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
-#include "./neon.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 // Helpful macros.
@@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
 }
 
 static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+                                int stride, uint8_t* filtered_data) {
   DoVerticalFilter_NEON(data, width, height, stride, 0, height,
                         filtered_data);
 }
@@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
 }
 
 static void GradientFilter_NEON(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+                                int stride, uint8_t* filtered_data) {
   DoGradientFilter_NEON(data, width, height, stride, 0, height,
                         filtered_data);
 }
@@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
 // GradientUnfilter_NEON is correct but slower than the C-version,
 // at least on ARM64. For armv7, it's a wash.
 // So best is to disable it for now, but keep the idea around...
-// #define USE_GRADIENT_UNFILTER
+#if !defined(USE_GRADIENT_UNFILTER)
+#define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
+#endif
 
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
 #define GRAD_PROCESS_LANE(L)  do {                                             \
   const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
   const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
@@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
 #undef GRAD_PROCESS_LANE
 
 static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
-                                 uint8_t* out, int width) {
+                                  uint8_t* out, int width) {
   if (prev == NULL) {
     HorizontalUnfilter_NEON(NULL, in, out, width);
   } else {
@@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
 #endif
 
diff --git a/thirdparty/libwebp/dsp/filters_sse2.c b/thirdparty/libwebp/src/dsp/filters_sse2.c
index 67f77999e6..5a18895676 100644
--- a/thirdparty/libwebp/dsp/filters_sse2.c
+++ b/thirdparty/libwebp/src/dsp/filters_sse2.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
@@ -24,16 +24,16 @@
 // Helpful macro.
 
 # define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
+  assert((in) != NULL);                                                        \
+  assert((out) != NULL);                                                       \
   assert(width > 0);                                                           \
   assert(height > 0);                                                          \
   assert(stride >= width);                                                     \
   assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
   (void)height;  // Silence unused warning.
 
-static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
-                           uint8_t* dst, int length) {
+static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
+                                uint8_t* dst, int length) {
   int i;
   const int max_pos = length & ~31;
   assert(length >= 0);
@@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
 }
 
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
   int i;
   const int max_pos = length & ~31;
   assert(length >= 0);
@@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 // Horizontal filter.
 
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
+                                                int width, int height,
+                                                int stride,
+                                                int row, int num_rows,
+                                                uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
@@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   if (row == 0) {
     // Leftmost pixel is the same as input for topmost scanline.
     out[0] = in[0];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   while (row < last_row) {
     // Leftmost pixel is predicted from above.
     out[0] = in[0] - in[-stride];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     ++row;
     in += stride;
     out += stride;
@@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
 
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
@@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
     // Very first top-left pixel is copied.
     out[0] = in[0];
     // Rest of top scan-line is left-predicted.
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 
   // Filter line-by-line.
   while (row < last_row) {
-    PredictLineTop(in, in - stride, out, width);
+    PredictLineTop_SSE2(in, in - stride, out, width);
     ++row;
     in += stride;
     out += stride;
@@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Gradient filter.
 
-static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
   const int g = a + b - c;
   return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }
 
-static void GradientPredictDirect(const uint8_t* const row,
-                                  const uint8_t* const top,
-                                  uint8_t* const out, int length) {
+static void GradientPredictDirect_SSE2(const uint8_t* const row,
+                                       const uint8_t* const top,
+                                       uint8_t* const out, int length) {
   const int max_pos = length & ~7;
   int i;
   const __m128i zero = _mm_setzero_si128();
@@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
     _mm_storel_epi64((__m128i*)(out + i), H);
   }
   for (; i < length; ++i) {
-    out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+    out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
   }
 }
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
@@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // left prediction for top scan-line
   if (row == 0) {
     out[0] = in[0];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // Filter line-by-line.
   while (row < last_row) {
     out[0] = in[0] - in[-stride];
-    GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
+    GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
     ++row;
     in += stride;
     out += stride;
@@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
 
 //------------------------------------------------------------------------------
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
+                                  int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
+                          filtered_data);
 }
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
+                                int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
 }
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
+                                int stride, uint8_t* filtered_data) {
+  DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
 }
 
 //------------------------------------------------------------------------------
 // Inverse transforms
 
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                    uint8_t* out, int width) {
   int i;
   __m128i last;
   out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
   for (; i < width; ++i) out[i] = in[i] + out[i - 1];
 }
 
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_SSE2(NULL, in, out, width);
   } else {
     int i;
     const int max_pos = width & ~31;
@@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
   }
 }
 
-static void GradientPredictInverse(const uint8_t* const in,
-                                   const uint8_t* const top,
-                                   uint8_t* const row, int length) {
+static void GradientPredictInverse_SSE2(const uint8_t* const in,
+                                        const uint8_t* const top,
+                                        uint8_t* const row, int length) {
   if (length > 0) {
     int i;
     const int max_pos = length & ~7;
@@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
       _mm_storel_epi64((__m128i*)&row[i], out);
     }
     for (; i < length; ++i) {
-      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+      row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
     }
   }
 }
 
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_SSE2(NULL, in, out, width);
   } else {
     out[0] = in[0] + prev[0];  // predict from above
-    GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
+    GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
   }
 }
 
@@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
 
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c
index 20d18f6ecd..83f553d9ad 100644
--- a/thirdparty/libwebp/dsp/lossless.c
+++ b/thirdparty/libwebp/src/dsp/lossless.c
@@ -13,14 +13,15 @@
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 
 #define MAX_DIFF_COST (1e30f)
 
@@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }
 
-// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
+// inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
 # define LOCAL_INLINE __attribute__ ((noinline))
 #else
 # define LOCAL_INLINE WEBP_INLINE
@@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
   (void)top;
   return left;
 }
-static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average3(left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(left, top[0]);
   return pred;
 }
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Select(top[0], left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
   return pred;
 }
 
-GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
-static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
-                          int num_pixels, uint32_t* out) {
+GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
   int i;
   uint32_t left = out[-1];
   for (i = 0; i < num_pixels; ++i) {
@@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
   }
   (void)upper;
 }
-GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
-GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
-GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
-GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
-GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
-GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
-GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
-GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
-GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
-GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
-GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
-GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
+GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
 
 //------------------------------------------------------------------------------
 
 // Inverse prediction.
-static void PredictorInverseTransform(const VP8LTransform* const transform,
-                                      int y_start, int y_end,
-                                      const uint32_t* in, uint32_t* out) {
+static void PredictorInverseTransform_C(const VP8LTransform* const transform,
+                                        int y_start, int y_end,
+                                        const uint32_t* in, uint32_t* out) {
   const int width = transform->xsize_;
   if (y_start == 0) {  // First Row follows the L (mode=1) mode.
-    PredictorAdd0(in, NULL, 1, out);
-    PredictorAdd1(in + 1, NULL, width - 1, out + 1);
+    PredictorAdd0_C(in, NULL, 1, out);
+    PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
     in += width;
     out += width;
     ++y_start;
@@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
       const uint32_t* pred_mode_src = pred_mode_base;
       int x = 1;
       // First pixel follows the T (mode=2) mode.
-      PredictorAdd2(in, out - width, 1, out);
+      PredictorAdd2_C(in, out - width, 1, out);
       // .. the rest:
       while (x < width) {
         const VP8LPredictorAddSubFunc pred_func =
@@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
     const uint32_t argb = src[i];
     const uint32_t green = argb >> 8;
     const uint32_t red = argb >> 16;
-    int new_red = red;
-    int new_blue = argb;
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
     new_red += ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue += ColorTransformDelta(m->green_to_blue_, green);
@@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
 }
 
 // Color space inverse transform.
-static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
-                                       int y_start, int y_end,
-                                       const uint32_t* src, uint32_t* dst) {
+static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
+                                         int y_start, int y_end,
+                                         const uint32_t* src, uint32_t* dst) {
   const int width = transform->xsize_;
   const int tile_width = 1 << transform->bits_;
   const int mask = tile_width - 1;
@@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform,               \
   }                                                                            \
 }
 
-COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
-                    VP8GetARGBIndex, VP8GetARGBValue)
-COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
-                    8b, VP8GetAlphaIndex, VP8GetAlphaValue)
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
+                    uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
+                    uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
 
 #undef COLOR_INDEX_INVERSE
 
@@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
       VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
       break;
     case PREDICTOR_TRANSFORM:
-      PredictorInverseTransform(transform, row_start, row_end, in, out);
+      PredictorInverseTransform_C(transform, row_start, row_end, in, out);
       if (row_end != transform->ysize_) {
         // The last predicted row in this iteration will be the top-pred row
         // for the first row in next iteration.
@@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
       }
       break;
     case CROSS_COLOR_TRANSFORM:
-      ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
+      ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
       break;
     case COLOR_INDEXING_TRANSFORM:
       if (in == out && transform->bits_ > 0) {
@@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
             VP8LSubSampleSize(transform->xsize_, transform->bits_);
         uint32_t* const src = out + out_stride - in_stride;
         memmove(src, out, in_stride * sizeof(*src));
-        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+        ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
       } else {
-        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+        ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
       }
       break;
   }
@@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
     const uint32_t argb = *src++;
     const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
     const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     *dst++ = ba;
     *dst++ = rg;
 #else
@@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
     const uint32_t argb = *src++;
     const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
     const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     *dst++ = gb;
     *dst++ = rg;
 #else
@@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
       const uint32_t argb = *src++;
-
-#if !defined(WORDS_BIGENDIAN)
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
       WebPUint32ToMem(dst, BSwap32(argb));
-#else  // WEBP_REFERENCE_IMPLEMENTATION
-      dst[0] = (argb >> 24) & 0xff;
-      dst[1] = (argb >> 16) & 0xff;
-      dst[2] = (argb >>  8) & 0xff;
-      dst[3] = (argb >>  0) & 0xff;
-#endif
-#else  // WORDS_BIGENDIAN
-      dst[0] = (argb >>  0) & 0xff;
-      dst[1] = (argb >>  8) & 0xff;
-      dst[2] = (argb >> 16) & 0xff;
-      dst[3] = (argb >> 24) & 0xff;
-#endif
       dst += sizeof(argb);
     }
   } else {
@@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
 static volatile VP8CPUInfo lossless_last_cpuinfo_used =
     (VP8CPUInfo)&lossless_last_cpuinfo_used;
 
-#define COPY_PREDICTOR_ARRAY(IN, OUT) do {              \
-  (OUT)[0] = IN##0;                                     \
-  (OUT)[1] = IN##1;                                     \
-  (OUT)[2] = IN##2;                                     \
-  (OUT)[3] = IN##3;                                     \
-  (OUT)[4] = IN##4;                                     \
-  (OUT)[5] = IN##5;                                     \
-  (OUT)[6] = IN##6;                                     \
-  (OUT)[7] = IN##7;                                     \
-  (OUT)[8] = IN##8;                                     \
-  (OUT)[9] = IN##9;                                     \
-  (OUT)[10] = IN##10;                                   \
-  (OUT)[11] = IN##11;                                   \
-  (OUT)[12] = IN##12;                                   \
-  (OUT)[13] = IN##13;                                   \
-  (OUT)[14] = IN##0; /* <- padding security sentinels*/ \
-  (OUT)[15] = IN##0;                                    \
+#define COPY_PREDICTOR_ARRAY(IN, OUT) do {                \
+  (OUT)[0] = IN##0_C;                                     \
+  (OUT)[1] = IN##1_C;                                     \
+  (OUT)[2] = IN##2_C;                                     \
+  (OUT)[3] = IN##3_C;                                     \
+  (OUT)[4] = IN##4_C;                                     \
+  (OUT)[5] = IN##5_C;                                     \
+  (OUT)[6] = IN##6_C;                                     \
+  (OUT)[7] = IN##7_C;                                     \
+  (OUT)[8] = IN##8_C;                                     \
+  (OUT)[9] = IN##9_C;                                     \
+  (OUT)[10] = IN##10_C;                                   \
+  (OUT)[11] = IN##11_C;                                   \
+  (OUT)[12] = IN##12_C;                                   \
+  (OUT)[13] = IN##13_C;                                   \
+  (OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
+  (OUT)[15] = IN##0_C;                                    \
 } while (0);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
@@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
 
+#if !WEBP_NEON_OMIT_C_CODE
   VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
 
   VP8LTransformColorInverse = VP8LTransformColorInverse_C;
 
-  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
   VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+#endif
+
   VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
   VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
-  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
 
-  VP8LMapColor32b = MapARGB;
-  VP8LMapColor8b = MapAlpha;
+  VP8LMapColor32b = MapARGB_C;
+  VP8LMapColor8b = MapAlpha_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
       VP8LDspInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       VP8LDspInitMIPSdspR2();
@@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LDspInitNEON();
+  }
+#endif
+
+  assert(VP8LAddGreenToBlueAndRed != NULL);
+  assert(VP8LTransformColorInverse != NULL);
+  assert(VP8LConvertBGRAToRGBA != NULL);
+  assert(VP8LConvertBGRAToRGB != NULL);
+  assert(VP8LConvertBGRAToBGR != NULL);
+  assert(VP8LConvertBGRAToRGBA4444 != NULL);
+  assert(VP8LConvertBGRAToRGB565 != NULL);
+  assert(VP8LMapColor32b != NULL);
+  assert(VP8LMapColor8b != NULL);
+
   lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #undef COPY_PREDICTOR_ARRAY
diff --git a/thirdparty/libwebp/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h
index 352a54e509..a99dbda686 100644
--- a/thirdparty/libwebp/dsp/lossless.h
+++ b/thirdparty/libwebp/src/dsp/lossless.h
@@ -15,18 +15,18 @@
 #ifndef WEBP_DSP_LOSSLESS_H_
 #define WEBP_DSP_LOSSLESS_H_
 
-#include "../webp/types.h"
-#include "../webp/decode.h"
+#include "src/webp/types.h"
+#include "src/webp/decode.h"
 
-#include "../enc/histogram_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/histogram_enc.h"
+#include "src/utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../enc/delta_palettization_enc.h"
+#include "src/enc/delta_palettization_enc.h"
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
 //------------------------------------------------------------------------------
@@ -124,7 +124,7 @@ void VP8LDspInit(void);
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
-                                       uint32_t* const dst, int num_pixels);
+                                       uint32_t* dst, int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
     const uint32_t* argb, int stride,
diff --git a/thirdparty/libwebp/dsp/lossless_common.h b/thirdparty/libwebp/src/dsp/lossless_common.h
index c40f711208..a2648d1737 100644
--- a/thirdparty/libwebp/dsp/lossless_common.h
+++ b/thirdparty/libwebp/src/dsp/lossless_common.h
@@ -16,9 +16,9 @@
 #ifndef WEBP_DSP_LOSSLESS_COMMON_H_
 #define WEBP_DSP_LOSSLESS_COMMON_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
-#include "../utils/utils.h"
+#include "src/utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1))) {  // zero or a power of two.
-    return log_floor;
-  }
-  return log_floor + 1;
-}
-
 // Splitting of distance and length codes into prefixes and
 // extra bits. The prefixes are encoded with an entropy code
 // while the extra bits are stored just as normal bits.
diff --git a/thirdparty/libwebp/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c
index 4e46fbab8b..92ca3c0542 100644
--- a/thirdparty/libwebp/dsp/lossless_enc.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc.c
@@ -13,15 +13,16 @@
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
-#include "./yuv.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
 
 // lookup table for small values of log2(int)
 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
@@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
 };
 
-static float FastSLog2Slow(uint32_t v) {
+static float FastSLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     int log_cnt = 0;
@@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) {
   }
 }
 
-static float FastLog2Slow(uint32_t v) {
+static float FastLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     int log_cnt = 0;
@@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) {
 // Methods to calculate Entropy (Shannon).
 
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
   int i;
   double retval = 0.;
   int sumX = 0, sumXY = 0;
@@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
 }
 
-static void GetEntropyUnrefined(const uint32_t X[], int length,
-                                VP8LBitEntropy* const bit_entropy,
-                                VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+                                  VP8LBitEntropy* const bit_entropy,
+                                  VP8LStreaks* const stats) {
   int i;
   int i_prev = 0;
   uint32_t x_prev = X[0];
@@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
   bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
 }
 
-static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
-                                        int length,
-                                        VP8LBitEntropy* const bit_entropy,
-                                        VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+                                          const uint32_t Y[],
+                                          int length,
+                                          VP8LBitEntropy* const bit_entropy,
+                                          VP8LStreaks* const stats) {
   int i = 1;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
@@ -520,8 +522,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
     const uint32_t argb = data[i];
     const uint32_t green = argb >> 8;
     const uint32_t red = argb >> 16;
-    int new_red = red;
-    int new_blue = argb;
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
     new_red -= ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@@ -577,8 +579,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
 
 //------------------------------------------------------------------------------
 
-static int VectorMismatch(const uint32_t* const array1,
-                          const uint32_t* const array2, int length) {
+static int VectorMismatch_C(const uint32_t* const array1,
+                            const uint32_t* const array2, int length) {
   int match_len = 0;
 
   while (match_len < length && array1[match_len] == array2[match_len]) {
@@ -610,15 +612,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
 
 //------------------------------------------------------------------------------
 
-static double ExtraCost(const uint32_t* population, int length) {
+static double ExtraCost_C(const uint32_t* population, int length) {
   int i;
   double cost = 0.;
   for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
   return cost;
 }
 
-static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
-                                int length) {
+static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+                                  int length) {
   int i;
   double cost = 0.;
   for (i = 2; i < length - 2; ++i) {
@@ -630,9 +632,9 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
 
 //------------------------------------------------------------------------------
 
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
+static void HistogramAdd_C(const VP8LHistogram* const a,
+                           const VP8LHistogram* const b,
+                           VP8LHistogram* const out) {
   int i;
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
@@ -869,26 +871,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 
   VP8LDspInit();
 
+#if !WEBP_NEON_OMIT_C_CODE
   VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
 
   VP8LTransformColor = VP8LTransformColor_C;
+#endif
 
   VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
   VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
 
-  VP8LFastLog2Slow = FastLog2Slow;
-  VP8LFastSLog2Slow = FastSLog2Slow;
+  VP8LFastLog2Slow = FastLog2Slow_C;
+  VP8LFastSLog2Slow = FastSLog2Slow_C;
 
-  VP8LExtraCost = ExtraCost;
-  VP8LExtraCostCombined = ExtraCostCombined;
-  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
+  VP8LExtraCost = ExtraCost_C;
+  VP8LExtraCostCombined = ExtraCostCombined_C;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
 
-  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
-  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
 
-  VP8LHistogramAdd = HistogramAdd;
+  VP8LHistogramAdd = HistogramAdd_C;
 
-  VP8LVectorMismatch = VectorMismatch;
+  VP8LVectorMismatch = VectorMismatch_C;
   VP8LBundleColorMap = VP8LBundleColorMap_C;
 
   VP8LPredictorsSub[0] = PredictorSub0_C;
@@ -937,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LEncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8LEncDspInitMIPS32();
@@ -958,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LEncDspInitNEON();
+  }
+#endif
+
+  assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+  assert(VP8LTransformColor != NULL);
+  assert(VP8LCollectColorBlueTransforms != NULL);
+  assert(VP8LCollectColorRedTransforms != NULL);
+  assert(VP8LFastLog2Slow != NULL);
+  assert(VP8LFastSLog2Slow != NULL);
+  assert(VP8LExtraCost != NULL);
+  assert(VP8LExtraCostCombined != NULL);
+  assert(VP8LCombinedShannonEntropy != NULL);
+  assert(VP8LGetEntropyUnrefined != NULL);
+  assert(VP8LGetCombinedEntropyUnrefined != NULL);
+  assert(VP8LHistogramAdd != NULL);
+  assert(VP8LVectorMismatch != NULL);
+  assert(VP8LBundleColorMap != NULL);
+  assert(VP8LPredictorsSub[0] != NULL);
+  assert(VP8LPredictorsSub[1] != NULL);
+  assert(VP8LPredictorsSub[2] != NULL);
+  assert(VP8LPredictorsSub[3] != NULL);
+  assert(VP8LPredictorsSub[4] != NULL);
+  assert(VP8LPredictorsSub[5] != NULL);
+  assert(VP8LPredictorsSub[6] != NULL);
+  assert(VP8LPredictorsSub[7] != NULL);
+  assert(VP8LPredictorsSub[8] != NULL);
+  assert(VP8LPredictorsSub[9] != NULL);
+  assert(VP8LPredictorsSub[10] != NULL);
+  assert(VP8LPredictorsSub[11] != NULL);
+  assert(VP8LPredictorsSub[12] != NULL);
+  assert(VP8LPredictorsSub[13] != NULL);
+  assert(VP8LPredictorsSub[14] != NULL);
+  assert(VP8LPredictorsSub[15] != NULL);
+  assert(VP8LPredictorsSub_C[0] != NULL);
+  assert(VP8LPredictorsSub_C[1] != NULL);
+  assert(VP8LPredictorsSub_C[2] != NULL);
+  assert(VP8LPredictorsSub_C[3] != NULL);
+  assert(VP8LPredictorsSub_C[4] != NULL);
+  assert(VP8LPredictorsSub_C[5] != NULL);
+  assert(VP8LPredictorsSub_C[6] != NULL);
+  assert(VP8LPredictorsSub_C[7] != NULL);
+  assert(VP8LPredictorsSub_C[8] != NULL);
+  assert(VP8LPredictorsSub_C[9] != NULL);
+  assert(VP8LPredictorsSub_C[10] != NULL);
+  assert(VP8LPredictorsSub_C[11] != NULL);
+  assert(VP8LPredictorsSub_C[12] != NULL);
+  assert(VP8LPredictorsSub_C[13] != NULL);
+  assert(VP8LPredictorsSub_C[14] != NULL);
+  assert(VP8LPredictorsSub_C[15] != NULL);
+
   lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
 
diff --git a/thirdparty/libwebp/dsp/lossless_enc_mips32.c b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 4186b9f50d..e7b58f4e8c 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_mips32.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -12,9 +12,9 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 
 #if defined(WEBP_USE_MIPS32)
 
@@ -23,7 +23,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-static float FastSLog2Slow(uint32_t v) {
+static float FastSLog2Slow_MIPS32(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     uint32_t log_cnt, y, correction;
@@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) {
   }
 }
 
-static float FastLog2Slow(uint32_t v) {
+static float FastLog2Slow_MIPS32(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     uint32_t log_cnt, y;
@@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) {
 //     pop += 2;
 //   }
 //   return (double)cost;
-static double ExtraCost(const uint32_t* const population, int length) {
+static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
   int i, temp0, temp1;
   const uint32_t* pop = &population[4];
   const uint32_t* const LoopEnd = &population[length];
@@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) {
 //     pY += 2;
 //   }
 //   return (double)cost;
-static double ExtraCostCombined(const uint32_t* const X,
-                                const uint32_t* const Y, int length) {
+static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+                                       const uint32_t* const Y, int length) {
   int i, temp0, temp1, temp2, temp3;
   const uint32_t* pX = &X[4];
   const uint32_t* pY = &Y[4];
@@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
 }
 
-static void GetEntropyUnrefined(const uint32_t X[], int length,
-                                VP8LBitEntropy* const bit_entropy,
-                                VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+                                       VP8LBitEntropy* const bit_entropy,
+                                       VP8LStreaks* const stats) {
   int i;
   int i_prev = 0;
   uint32_t x_prev = X[0];
@@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
   bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
 }
 
-static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
-                                        int length,
-                                        VP8LBitEntropy* const bit_entropy,
-                                        VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+                                               const uint32_t Y[],
+                                               int length,
+                                               VP8LBitEntropy* const entropy,
+                                               VP8LStreaks* const stats) {
   int i = 1;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
 
   memset(stats, 0, sizeof(*stats));
-  VP8LBitEntropyInit(bit_entropy);
+  VP8LBitEntropyInit(entropy);
 
   for (i = 1; i < length; ++i) {
     const uint32_t xy = X[i] + Y[i];
     if (xy != xy_prev) {
-      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
     }
   }
-  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
 
-  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+  entropy->entropy += VP8LFastSLog2(entropy->sum);
 }
 
 #define ASM_START                                       \
@@ -374,9 +375,9 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
   }                                                     \
 } while (0)
 
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
+static void HistogramAdd_MIPS32(const VP8LHistogram* const a,
+                                const VP8LHistogram* const b,
+                                VP8LHistogram* const out) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                              - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
@@ -415,13 +416,13 @@ static void HistogramAdd(const VP8LHistogram* const a,
 extern void VP8LEncDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
-  VP8LFastSLog2Slow = FastSLog2Slow;
-  VP8LFastLog2Slow = FastLog2Slow;
-  VP8LExtraCost = ExtraCost;
-  VP8LExtraCostCombined = ExtraCostCombined;
-  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
-  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
-  VP8LHistogramAdd = HistogramAdd;
+  VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
+  VP8LFastLog2Slow = FastLog2Slow_MIPS32;
+  VP8LExtraCost = ExtraCost_MIPS32;
+  VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
+  VP8LHistogramAdd = HistogramAdd_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/lossless_enc_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
index 0abf3c4f36..5855e6ae15 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
@@ -12,14 +12,14 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./lossless.h"
+#include "src/dsp/lossless.h"
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
-                                        int num_pixels) {
+static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
+                                                  int num_pixels) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
   uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
@@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
   return (uint32_t)((int)(color_pred) * color) >> 5;
 }
 
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
-                           int num_pixels) {
+static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+                                     uint32_t* data, int num_pixels) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   uint32_t argb, argb1, new_red, new_red1;
   const uint32_t G_to_R = m->green_to_red_;
@@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   return (new_blue & 0xff);
 }
 
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
-                                       int tile_width, int tile_height,
-                                       int green_to_blue, int red_to_blue,
-                                       int histo[]) {
+static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
+                                                 int stride,
+                                                 int tile_width,
+                                                 int tile_height,
+                                                 int green_to_blue,
+                                                 int red_to_blue,
+                                                 int histo[]) {
   const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
   const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
   const uint32_t mask = 0xff00ffu;
@@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
   return (new_red & 0xff);
 }
 
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
-                                      int tile_width, int tile_height,
-                                      int green_to_red, int histo[]) {
+static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+                                                int stride,
+                                                int tile_width,
+                                                int tile_height,
+                                                int green_to_red,
+                                                int histo[]) {
   const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
   while (tile_height-- > 0) {
     int x;
@@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
 extern void VP8LEncDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
-  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
-  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
+  VP8LTransformColor = TransformColor_MIPSdspR2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/lossless_enc_msa.c b/thirdparty/libwebp/src/dsp/lossless_enc_msa.c
index 2f69ba3bca..600dddfb59 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_msa.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_msa.c
@@ -11,12 +11,12 @@
 //
 // Authors: Prashant Patil (Prashant.Patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./lossless.h"
-#include "./msa_macro.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/msa_macro.h"
 
 #define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do {  \
   v8i16 g0, g1, t0, t1, t2, t3;                                               \
@@ -48,8 +48,8 @@
   dst = VSHF_UB(src, t0, mask1);                                \
 } while (0)
 
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
-                           int num_pixels) {
+static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+                               int num_pixels) {
   v16u8 src0, dst0;
   const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
                                          (m->green_to_red_ << 16));
@@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
   }
 }
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
+                                            int num_pixels) {
   int i;
   uint8_t* ptemp_data = (uint8_t*)argb_data;
   v16u8 src0, dst0, tmp0;
@@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 extern void VP8LEncDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
+  VP8LTransformColor = TransformColor_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/lossless_enc_neon.c b/thirdparty/libwebp/src/dsp/lossless_enc_neon.c
index 4c56f2594b..7c7b73f8b6 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_neon.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_neon.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <arm_neon.h>
 
-#include "./lossless.h"
-#include "./neon.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
@@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = {
   1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
 };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x16_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x16_t shuffle) {
   return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
                      vtbl1q_u8(argb, vget_high_u8(shuffle)));
 }
@@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
 // 255 = byte will be zeroed
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x8_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x8_t shuffle) {
   return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                      vtbl1_u8(vget_high_u8(argb), shuffle));
 }
 #endif  // USE_VTBLQ
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
+                                             int num_pixels) {
   const uint32_t* const end = argb_data + (num_pixels & ~3);
 #ifdef USE_VTBLQ
   const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 #endif
   for (; argb_data < end; argb_data += 4) {
     const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
     vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
   }
   // fallthrough and finish off with plain-C
@@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColor(const VP8LMultipliers* const m,
-                           uint32_t* argb_data, int num_pixels) {
+static void TransformColor_NEON(const VP8LMultipliers* const m,
+                                uint32_t* argb_data, int num_pixels) {
   // sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
   const int16_t rb[8] = {
@@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m,
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
     // 0 g 0 g
-    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
     // x dr  x db1
     const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
     // r 0   b   0
@@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m,
 extern void VP8LEncDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
+  VP8LTransformColor = TransformColor_NEON;
 }
 
 #else  // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 8ad85d94d7..1eaf35ca8e 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_sse2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -11,22 +11,23 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
 #include <emmintrin.h>
-#include "./lossless.h"
-#include "./common_sse2.h"
-#include "./lossless_common.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dsp/lossless_common.h"
 
 // For sign-extended multiplying constants, pre-shifted by 5:
-#define CST_5b(X)  (((int16_t)((uint16_t)X << 8)) >> 5)
+#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
+                                             int num_pixels) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@@ -45,8 +46,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColor(const VP8LMultipliers* const m,
-                           uint32_t* argb_data, int num_pixels) {
+static void TransformColor_SSE2(const VP8LMultipliers* const m,
+                                uint32_t* argb_data, int num_pixels) {
   const __m128i mults_rb = _mm_set_epi16(
       CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
       CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
@@ -80,10 +81,10 @@ static void TransformColor(const VP8LMultipliers* const m,
 
 //------------------------------------------------------------------------------
 #define SPAN 8
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
-                                       int tile_width, int tile_height,
-                                       int green_to_blue, int red_to_blue,
-                                       int histo[]) {
+static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+                                            int tile_width, int tile_height,
+                                            int green_to_blue, int red_to_blue,
+                                            int histo[]) {
   const __m128i mults_r = _mm_set_epi16(
       CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
       CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
@@ -131,9 +132,9 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
   }
 }
 
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
-                                      int tile_width, int tile_height,
-                                      int green_to_red, int histo[]) {
+static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+                                           int tile_width, int tile_height,
+                                           int green_to_red, int histo[]) {
   const __m128i mults_g = _mm_set_epi16(
       0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
       0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
@@ -177,8 +178,8 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
 //------------------------------------------------------------------------------
 
 #define LINE_SIZE 16    // 8 or 16
-static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
-                      int size) {
+static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+                           int size) {
   int i;
   assert(size % LINE_SIZE == 0);
   for (i = 0; i < size; i += LINE_SIZE) {
@@ -203,7 +204,7 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
   }
 }
 
-static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
   int i;
   assert(size % LINE_SIZE == 0);
   for (i = 0; i < size; i += LINE_SIZE) {
@@ -231,22 +232,22 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
 
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // that's ok since the histogram values are less than 1<<28 (max picture size).
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
+static void HistogramAdd_SSE2(const VP8LHistogram* const a,
+                              const VP8LHistogram* const b,
+                              VP8LHistogram* const out) {
   int i;
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   if (b != out) {
-    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
-    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
-    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
-    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
   } else {
-    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
-    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
-    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
-    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
   }
   for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
     out->literal_[i] = a->literal_[i] + b->literal_[i];
@@ -261,9 +262,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
 
 // Checks whether the X or Y contribution is worth computing and adding.
 // Used in loop unrolling.
-#define ANALYZE_X_OR_Y(x_or_y, j)                                   \
-  do {                                                              \
-    if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
+#define ANALYZE_X_OR_Y(x_or_y, j)                                           \
+  do {                                                                      \
+    if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
   } while (0)
 
 // Checks whether the X + Y contribution is worth computing and adding.
@@ -276,7 +277,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
     }                                  \
   } while (0)
 
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
   int i;
   double retval = 0.;
   int sumX, sumXY;
@@ -332,8 +333,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
 
 //------------------------------------------------------------------------------
 
-static int VectorMismatch(const uint32_t* const array1,
-                          const uint32_t* const array2, int length) {
+static int VectorMismatch_SSE2(const uint32_t* const array1,
+                               const uint32_t* const array2, int length) {
   int match_len;
 
   if (length >= 12) {
@@ -574,8 +575,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
 }
 
 // Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
-                            __m128i* const out) {
+static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
+                                 __m128i* const out) {
   // We can unpack with any value on the upper 32 bits, provided it's the same
   // on both operands (to that their sum of abs diff is zero). Here we use *A.
   const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
@@ -596,8 +597,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
     const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     __m128i pa, pb;
-    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
-    GetSumAbsDiff32(&L, &TL, &pb);   // pb = sum |L-TL|
+    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
+    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
     {
       const __m128i mask = _mm_cmpgt_epi32(pb, pa);
       const __m128i A = _mm_and_si128(mask, L);
@@ -677,13 +678,13 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
 extern void VP8LEncDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
-  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
-  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
-  VP8LHistogramAdd = HistogramAdd;
-  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
-  VP8LVectorMismatch = VectorMismatch;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
+  VP8LTransformColor = TransformColor_SSE2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
+  VP8LHistogramAdd = HistogramAdd_SSE2;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+  VP8LVectorMismatch = VectorMismatch_SSE2;
   VP8LBundleColorMap = BundleColorMap_SSE2;
 
   VP8LPredictorsSub[0] = PredictorSub0_SSE2;
diff --git a/thirdparty/libwebp/dsp/lossless_enc_sse41.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 821057ccd4..3526a342d3 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_sse41.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -11,17 +11,18 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 #include <assert.h>
 #include <smmintrin.h>
-#include "./lossless.h"
+#include "src/dsp/lossless.h"
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
+                                              int num_pixels) {
   int i;
   const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
                                            -1,  5, -1,  5, -1, 1, -1, 1);
@@ -43,7 +44,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 extern void VP8LEncDspInitSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/lossless_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 2984ce8df7..9888854d57 100644
--- a/thirdparty/libwebp/dsp/lossless_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -12,12 +12,12 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 
 #define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)                 \
 static void FUNC_NAME(const TYPE* src,                                         \
@@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src,                                         \
   }                                                                            \
 }
 
-MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
-MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
 
 #undef MAP_COLOR_FUNCS
 
@@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   return Average3(left, top[0], top[1]);
 }
 
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   return Average2(left, top[-1]);
 }
 
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   return Average2(left, top[0]);
 }
 
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   (void)left;
   return Average2(top[-1], top[0]);
 }
 
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   (void)left;
   return Average2(top[0], top[1]);
 }
 
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return Average4(left, top[-1], top[0], top[1]);
 }
 
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return Select(top[0], left, top[-1]);
 }
 
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return ClampedAddSubtractFull(left, top[0], top[-1]);
 }
 
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return ClampedAddSubtractHalf(left, top[0], top[-1]);
 }
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
+                                           uint32_t* dst) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
   );
 }
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
+                                            const uint32_t* src, int num_pixels,
+                                            uint32_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   uint32_t argb, argb1, new_red;
   const uint32_t G_to_R = m->green_to_red_;
@@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
   if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
+                                        int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
+                                            int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     "ins            %[temp3],    %[temp5],          16,   4    \n\t"
     "addiu          %[src],      %[src],            16         \n\t"
     "precr.qb.ph    %[temp3],    %[temp3],          %[temp2]   \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "usw            %[temp1],    0(%[dst])                     \n\t"
     "usw            %[temp3],    4(%[dst])                     \n\t"
 #else
@@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     "ins            %[temp0],    %[temp5],          16,   4    \n\t"
     "addiu          %[src],      %[src],            4          \n\t"
     "precr.qb.ph    %[temp0],    %[temp0],          %[temp0]   \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "ush            %[temp0],    0(%[dst])                     \n\t"
 #else
     "wsbh           %[temp0],    %[temp0]                      \n\t"
@@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToRGB565(const uint32_t* src,
-                                int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
+                                          int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     "ins            %[temp2],    %[temp3],          0,    5    \n\t"
     "addiu          %[src],      %[src],            16         \n\t"
     "append         %[temp2],    %[temp1],          16         \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "usw            %[temp0],    0(%[dst])                     \n\t"
     "usw            %[temp2],    4(%[dst])                     \n\t"
 #else
@@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     "ins            %[temp4],    %[temp5],          0,    11   \n\t"
     "addiu          %[src],      %[src],            4          \n\t"
     "ins            %[temp4],    %[temp0],          0,    5    \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "ush            %[temp4],    0(%[dst])                     \n\t"
 #else
     "wsbh           %[temp4],    %[temp4]                      \n\t"
@@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src,
 extern void VP8LDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
-  VP8LMapColor32b = MapARGB;
-  VP8LMapColor8b = MapAlpha;
-  VP8LPredictors[5] = Predictor5;
-  VP8LPredictors[6] = Predictor6;
-  VP8LPredictors[7] = Predictor7;
-  VP8LPredictors[8] = Predictor8;
-  VP8LPredictors[9] = Predictor9;
-  VP8LPredictors[10] = Predictor10;
-  VP8LPredictors[11] = Predictor11;
-  VP8LPredictors[12] = Predictor12;
-  VP8LPredictors[13] = Predictor13;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
-  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LMapColor32b = MapARGB_MIPSdspR2;
+  VP8LMapColor8b = MapAlpha_MIPSdspR2;
+
+  VP8LPredictors[5] = Predictor5_MIPSdspR2;
+  VP8LPredictors[6] = Predictor6_MIPSdspR2;
+  VP8LPredictors[7] = Predictor7_MIPSdspR2;
+  VP8LPredictors[8] = Predictor8_MIPSdspR2;
+  VP8LPredictors[9] = Predictor9_MIPSdspR2;
+  VP8LPredictors[10] = Predictor10_MIPSdspR2;
+  VP8LPredictors[11] = Predictor11_MIPSdspR2;
+  VP8LPredictors[12] = Predictor12_MIPSdspR2;
+  VP8LPredictors[13] = Predictor13_MIPSdspR2;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
+  VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
+
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
+  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
+  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/lossless_msa.c b/thirdparty/libwebp/src/dsp/lossless_msa.c
index f6dd5649ac..9f5472078d 100644
--- a/thirdparty/libwebp/dsp/lossless_msa.c
+++ b/thirdparty/libwebp/src/dsp/lossless_msa.c
@@ -11,12 +11,12 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./lossless.h"
-#include "./msa_macro.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/msa_macro.h"
 
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
@@ -43,7 +43,7 @@
 
 #define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {         \
   uint64_t pix_d;                                          \
-  v16u8 src0, src1, src2, dst0, dst1;                      \
+  v16u8 src0, src1, src2 = { 0 }, dst0, dst1;              \
   LD_UB2(psrc, 16, src0, src1);                            \
   VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
   ST_UB(dst0, pdst);                                       \
@@ -109,8 +109,8 @@
   dst = VSHF_UB(src, t0, mask1);                                        \
 } while (0)
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   int i;
   const uint8_t* ptemp_src = (const uint8_t*)src;
   uint8_t* ptemp_dst = (uint8_t*)dst;
@@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_MSA(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
   const uint8_t* ptemp_src = (const uint8_t*)src;
   uint8_t* ptemp_dst = (uint8_t*)dst;
   const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
@@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_MSA(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
   const uint8_t* ptemp_src = (const uint8_t*)src;
   uint8_t* ptemp_dst = (uint8_t*)dst;
   const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
@@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
   }
 }
 
-static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
+                                     uint32_t* dst) {
   int i;
   const uint8_t* in = (const uint8_t*)src;
   uint8_t* out = (uint8_t*)dst;
@@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
   }
 }
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
+                                      const uint32_t* src, int num_pixels,
+                                      uint32_t* dst) {
   v16u8 src0, dst0;
   const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
                                          (m->green_to_red_ << 16));
@@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
 extern void VP8LDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
+  VP8LTransformColorInverse = TransformColorInverse_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/lossless_neon.c b/thirdparty/libwebp/src/dsp/lossless_neon.c
index 1145d5fad0..76a1b6f873 100644
--- a/thirdparty/libwebp/dsp/lossless_neon.c
+++ b/thirdparty/libwebp/src/dsp/lossless_neon.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <arm_neon.h>
 
-#include "./lossless.h"
-#include "./neon.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
@@ -26,8 +26,8 @@
 #if !defined(WORK_AROUND_GCC)
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
 // gcc-4.8.x at least.
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
   VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
 
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~1);
   const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
   for (; src < end; src += 2) {
@@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
   { 21, 22, 24, 25, 26, 28, 29, 30 }
 };
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
   { 21, 20, 26, 25, 24, 30, 29, 28 }
 };
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
 
 #endif   // !WORK_AROUND_GCC
 
-
 //------------------------------------------------------------------------------
 // Predictor Transform
 
@@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = {
   1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
 };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x16_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x16_t shuffle) {
   return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
                      vtbl1q_u8(argb, vget_high_u8(shuffle)));
 }
@@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
 // 255 = byte will be zeroed
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x8_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x8_t shuffle) {
   return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                      vtbl1_u8(vget_high_u8(argb), shuffle));
 }
 #endif  // USE_VTBLQ
 
-static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
+                                      uint32_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~3);
 #ifdef USE_VTBLQ
   const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
 #endif
   for (; src < end; src += 4, dst += 4) {
     const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
-    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
     vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
   }
   // fallthrough and finish off with plain-C
@@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* const src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
+                                       const uint32_t* const src,
+                                       int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
   const int16_t rb[8] = {
@@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
     const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
     const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
     // 0 g 0 g
-    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
     // x dr  x db1
     const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
     // x r'  x   b'
@@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
   VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
   VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
 
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
 
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
+  VP8LTransformColorInverse = TransformColorInverse_NEON;
 }
 
 #else  // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c
index 15aae93869..653b466cd6 100644
--- a/thirdparty/libwebp/dsp/lossless_sse2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c
@@ -11,21 +11,22 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
-#include "./common_sse2.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 #include <assert.h>
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
 // Predictor Transform
 
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
+static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
+                                                        uint32_t c1,
+                                                        uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
   return output;
 }
 
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
+                                                        uint32_t c1,
+                                                        uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   return output;
 }
 
-static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
   int pa_minus_pb;
   const __m128i zero = _mm_setzero_si128();
   const __m128i A0 = _mm_cvtsi32_si128(a);
@@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
   *avg = _mm_sub_epi8(avg1, one);
 }
 
-static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
-                                        __m128i* const avg) {
+static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
+                                             const uint32_t a1,
+                                             __m128i* const avg) {
   // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
   const __m128i ones = _mm_set1_epi8(1);
   const __m128i A0 = _mm_cvtsi32_si128(a0);
@@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
   *avg = _mm_sub_epi8(avg1, one);
 }
 
-static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
@@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
   return _mm_srli_epi16(sum, 1);
 }
 
-static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
   __m128i output;
-  Average2_uint32(a0, a1, &output);
+  Average2_uint32_SSE2(a0, a1, &output);
   return _mm_cvtsi128_si32(output);
 }
 
-static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
+                                          uint32_t a2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i avg1 = Average2_uint32_16(a0, a2);
+  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
   const __m128i sum = _mm_add_epi16(avg1, A1);
   const __m128i avg2 = _mm_srli_epi16(sum, 1);
@@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
   return output;
 }
 
-static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
-                                     uint32_t a2, uint32_t a3) {
-  const __m128i avg1 = Average2_uint32_16(a0, a1);
-  const __m128i avg2 = Average2_uint32_16(a2, a3);
+static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
+                                          uint32_t a2, uint32_t a3) {
+  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
+  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
   const __m128i sum = _mm_add_epi16(avg2, avg1);
   const __m128i avg3 = _mm_srli_epi16(sum, 1);
   const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
 }
 
 static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
   return pred;
 }
 static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+  const uint32_t pred = Average2_SSE2(left, top[-1]);
   return pred;
 }
 static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+  const uint32_t pred = Average2_SSE2(left, top[0]);
   return pred;
 }
 static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[-1], top[0]);
+  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
 static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[0], top[1]);
+  const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
 static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
   return pred;
 }
 static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
   return pred;
 }
 static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
   return pred;
 }
 static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
   return pred;
 }
 
@@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2
 
 // Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(OUT) do {               \
+  __m128i avgLTL, avg;                    \
+  Average2_m128i(&L, &TL, &avgLTL);       \
+  Average2_m128i(&avgTTR, &avgLTL, &avg); \
+  L = _mm_add_epi8(avg, src);             \
+  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
+} while (0)
+
+#define DO_PRED10_SHIFT do {                                  \
+  /* Rotate the pre-computed values for the next iteration.*/ \
+  avgTTR = _mm_srli_si128(avgTTR, 4);                         \
+  TL = _mm_srli_si128(TL, 4);                                 \
+  src = _mm_srli_si128(src, 4);                               \
+} while (0)
+
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
-  int i, j;
+  int i;
   __m128i L = _mm_cvtsi32_si128(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
     const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
     __m128i avgTTR;
     Average2_m128i(&T, &TR, &avgTTR);
-    for (j = 0; j < 4; ++j) {
-      __m128i avgLTL, avg;
-      Average2_m128i(&L, &TL, &avgLTL);
-      Average2_m128i(&avgTTR, &avgLTL, &avg);
-      L = _mm_add_epi8(avg, src);
-      out[i + j] = _mm_cvtsi128_si32(L);
-      // Rotate the pre-computed values for the next iteration.
-      avgTTR = _mm_srli_si128(avgTTR, 4);
-      TL = _mm_srli_si128(TL, 4);
-      src = _mm_srli_si128(src, 4);
-    }
+    DO_PRED10(0);
+    DO_PRED10_SHIFT;
+    DO_PRED10(1);
+    DO_PRED10_SHIFT;
+    DO_PRED10(2);
+    DO_PRED10_SHIFT;
+    DO_PRED10(3);
   }
   if (i != num_pixels) {
     VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
   }
 }
+#undef DO_PRED10
+#undef DO_PRED10_SHIFT
 
 // Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
-                            __m128i* const out) {
-  // We can unpack with any value on the upper 32 bits, provided it's the same
-  // on both operands (to that their sum of abs diff is zero). Here we use *A.
-  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
-  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
-  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
-  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
-  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
-  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
-  *out = _mm_packs_epi32(s_lo, s_hi);
-}
+#define DO_PRED11(OUT) do {                                            \
+  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
+  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
+  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
+  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
+  const __m128i A = _mm_and_si128(mask, L);                            \
+  const __m128i B = _mm_andnot_si128(mask, T);                         \
+  const __m128i pred = _mm_or_si128(A, B); /* pred = pb > pa ? L : T*/ \
+  L = _mm_add_epi8(src, pred);                                         \
+  out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
+} while (0)
+
+#define DO_PRED11_SHIFT do {                                \
+  /* Shift the pre-computed value for the next iteration.*/ \
+  T = _mm_srli_si128(T, 4);                                 \
+  TL = _mm_srli_si128(TL, 4);                               \
+  src = _mm_srli_si128(src, 4);                             \
+  pa = _mm_srli_si128(pa, 4);                               \
+} while (0)
 
 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
-  int i, j;
+  int i;
+  __m128i pa;
   __m128i L = _mm_cvtsi32_si128(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
-    __m128i pa;
-    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
-    for (j = 0; j < 4; ++j) {
-      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
-      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
-      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
-      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
-      const __m128i A = _mm_and_si128(mask, L);
-      const __m128i B = _mm_andnot_si128(mask, T);
-      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
-      L = _mm_add_epi8(src, pred);
-      out[i + j] = _mm_cvtsi128_si32(L);
-      // Shift the pre-computed value for the next iteration.
-      T = _mm_srli_si128(T, 4);
-      TL = _mm_srli_si128(TL, 4);
-      src = _mm_srli_si128(src, 4);
-      pa = _mm_srli_si128(pa, 4);
+    {
+      // We can unpack with any value on the upper 32 bits, provided it's the
+      // same on both operands (so that their sum of abs diff is zero). Here we
+      // use T.
+      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
+      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
+      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
+      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
+      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
+      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
+      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
     }
+    DO_PRED11(0);
+    DO_PRED11_SHIFT;
+    DO_PRED11(1);
+    DO_PRED11_SHIFT;
+    DO_PRED11(2);
+    DO_PRED11_SHIFT;
+    DO_PRED11(3);
   }
   if (i != num_pixels) {
     VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
   }
 }
+#undef DO_PRED11
+#undef DO_PRED11_SHIFT
 
 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT)                          \
-do {                                                        \
-  const __m128i all = _mm_add_epi16(L, (DIFF));             \
-  const __m128i alls = _mm_packus_epi16(all, all);          \
-  const __m128i res = _mm_add_epi8(src, alls);              \
-  out[i + (OUT)] = _mm_cvtsi128_si32(res);                  \
-  L = _mm_unpacklo_epi8(res, zero);                         \
+#define DO_PRED12(DIFF, LANE, OUT) do {            \
+  const __m128i all = _mm_add_epi16(L, (DIFF));    \
+  const __m128i alls = _mm_packus_epi16(all, all); \
+  const __m128i res = _mm_add_epi8(src, alls);     \
+  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
+  L = _mm_unpacklo_epi8(res, zero);                \
+} while (0)
+
+#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
   /* Shift the pre-computed value for the next iteration.*/ \
-  if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
+  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
   src = _mm_srli_si128(src, 4);                             \
 } while (0)
 
@@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
     __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
     __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
     DO_PRED12(diff_lo, 0, 0);
+    DO_PRED12_SHIFT(diff_lo, 0);
     DO_PRED12(diff_lo, 1, 1);
+    DO_PRED12_SHIFT(diff_lo, 1);
     DO_PRED12(diff_hi, 0, 2);
+    DO_PRED12_SHIFT(diff_hi, 0);
     DO_PRED12(diff_hi, 1, 3);
   }
   if (i != num_pixels) {
@@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
   }
 }
 #undef DO_PRED12
+#undef DO_PRED12_SHIFT
 
 // Due to averages with integers, values cannot be accumulated in parallel for
 // predictors 13.
@@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
+                                      uint32_t* dst) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@@ -414,9 +448,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* const src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
+                                       const uint32_t* const src,
+                                       int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 5.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
   const __m128i mults_rb = _mm_set_epi16(
@@ -454,8 +488,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
 //------------------------------------------------------------------------------
 // Color-space conversion functions
 
-static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
-                             uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
+                                  uint8_t* dst) {
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
 
@@ -490,27 +524,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
   }
 }
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
+  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   while (num_pixels >= 8) {
-    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
-    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
-    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
-    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
-    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
-    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
-    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
-    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
-    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
-    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
-    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
-    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
-    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
-    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
-    _mm_storeu_si128(out++, rgba0);
-    _mm_storeu_si128(out++, rgba4);
+    const __m128i A1 = _mm_loadu_si128(in++);                // B G R A
+    const __m128i A2 = _mm_loadu_si128(in++);                // B G R A
+    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // B 0 R 0
+    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // B 0 R 0
+    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
+    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
+    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i F1 = _mm_or_si128(E1, C1);                 // R G B A
+    const __m128i F2 = _mm_or_si128(E2, C2);                 // R G B A
+    _mm_storeu_si128(out++, F1);
+    _mm_storeu_si128(out++, F2);
     num_pixels -= 8;
   }
   // left-overs
@@ -519,8 +552,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
   const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
   const __m128i* in = (const __m128i*)src;
@@ -541,7 +574,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
     const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
     const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
 #else
     const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
@@ -555,8 +588,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGB565(const uint32_t* src,
-                                int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
+                                     int num_pixels, uint8_t* dst) {
   const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
   const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@@ -582,7 +615,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
     const __m128i b1 = _mm_srli_epi16(b0, 3);
     const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
 #else
     const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
@@ -596,8 +629,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
   const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
   const __m128i* in = (const __m128i*)src;
@@ -660,14 +693,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
   VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
   VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
 
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
+  VP8LTransformColorInverse = TransformColorInverse_SSE2;
 
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
-  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
+  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
+  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/mips_macro.h b/thirdparty/libwebp/src/dsp/mips_macro.h
index 44aba9b71d..44aba9b71d 100644
--- a/thirdparty/libwebp/dsp/mips_macro.h
+++ b/thirdparty/libwebp/src/dsp/mips_macro.h
diff --git a/thirdparty/libwebp/dsp/msa_macro.h b/thirdparty/libwebp/src/dsp/msa_macro.h
index d0e5f45e01..dfacda6ccd 100644
--- a/thirdparty/libwebp/dsp/msa_macro.h
+++ b/thirdparty/libwebp/src/dsp/msa_macro.h
@@ -22,6 +22,7 @@
 #endif
 
 #ifdef CLANG_BUILD
+  #define ALPHAVAL  (-1)
   #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)a, b)
   #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)a, b)
   #define SRAI_B(a, b)  __msa_srai_b((v16i8)a, b)
@@ -32,6 +33,7 @@
   #define ANDI_B(a, b)  __msa_andi_b((v16u8)a, b)
   #define ORI_B(a, b)   __msa_ori_b((v16u8)a, b)
 #else
+  #define ALPHAVAL  (0xff)
   #define ADDVI_H(a, b)  (a + b)
   #define ADDVI_W(a, b)  (a + b)
   #define SRAI_B(a, b)  (a >> b)
diff --git a/thirdparty/libwebp/dsp/neon.h b/thirdparty/libwebp/src/dsp/neon.h
index 3b548a6855..aa1dea1301 100644
--- a/thirdparty/libwebp/dsp/neon.h
+++ b/thirdparty/libwebp/src/dsp/neon.h
@@ -14,11 +14,12 @@
 
 #include <arm_neon.h>
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 // Right now, some intrinsics functions seem slower, so we disable them
-// everywhere except aarch64 where the inline assembly is incompatible.
-#if defined(__aarch64__)
+// everywhere except newer clang/gcc or aarch64 where the inline assembly is
+// incompatible.
+#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif
 
@@ -43,11 +44,11 @@
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
 #define WORK_AROUND_GCC
 #endif
 
-static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
+static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
   uint64x2x2_t row01, row23;
 
   row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
diff --git a/thirdparty/libwebp/dsp/rescaler.c b/thirdparty/libwebp/src/dsp/rescaler.c
index 0f54502352..4b6b7834e5 100644
--- a/thirdparty/libwebp/dsp/rescaler.c
+++ b/thirdparty/libwebp/src/dsp/rescaler.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 
-#include "./dsp.h"
-#include "../utils/rescaler_utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
@@ -25,7 +25,8 @@
 //------------------------------------------------------------------------------
 // Row import
 
-void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
@@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
   }
 }
 
-void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
@@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
 //------------------------------------------------------------------------------
 // Row export
 
-void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
+void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
   }
 }
 
-void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
+void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -207,11 +209,14 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
   if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
+#if !defined(WEBP_REDUCE_SIZE)
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
+  WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-  WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
-  WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
-  WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
-  WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
+  WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
+  WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -219,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
       WebPRescalerDspInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPRescalerDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       WebPRescalerDspInitMIPS32();
@@ -240,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPRescalerDspInitNEON();
+  }
+#endif
+
+  assert(WebPRescalerExportRowExpand != NULL);
+  assert(WebPRescalerExportRowShrink != NULL);
+  assert(WebPRescalerImportRowExpand != NULL);
+  assert(WebPRescalerImportRowShrink != NULL);
+#endif   // !WEBP_REDUCE_SIZE
   rescaler_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/rescaler_mips32.c b/thirdparty/libwebp/src/dsp/rescaler_mips32.c
index e09ad5d19f..542f7e5970 100644
--- a/thirdparty/libwebp/dsp/rescaler_mips32.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_mips32.c
@@ -11,17 +11,18 @@
 //
 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_MIPS32)
+#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
 
 #include <assert.h>
-#include "../utils/rescaler_utils.h"
+#include "src/utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 // Row import
 
-static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
+static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int fx_scale = wrk->fx_scale;
@@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
   }
 }
 
-static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
+static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
@@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
 //------------------------------------------------------------------------------
 // Row export
 
-static void ExportRowExpand(WebPRescaler* const wrk) {
+static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -207,7 +209,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
   }
 }
 
-static void ExportRowShrink(WebPRescaler* const wrk) {
+static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
@@ -278,10 +280,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
-  WebPRescalerImportRowExpand = ImportRowExpand;
-  WebPRescalerImportRowShrink = ImportRowShrink;
-  WebPRescalerExportRowExpand = ExportRowExpand;
-  WebPRescalerExportRowShrink = ExportRowShrink;
+  WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
+  WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
+  WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
+  WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/rescaler_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
index 2308d64544..b78aac15e6 100644
--- a/thirdparty/libwebp/dsp/rescaler_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
@@ -11,12 +11,12 @@
 //
 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_MIPS_DSP_R2)
+#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
 
 #include <assert.h>
-#include "../utils/rescaler_utils.h"
+#include "src/utils/rescaler_utils.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -24,7 +24,7 @@
 //------------------------------------------------------------------------------
 // Row export
 
-static void ExportRowShrink(WebPRescaler* const wrk) {
+static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
   int i;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   uint8_t* dst = wrk->dst;
@@ -162,7 +162,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
   }
 }
 
-static void ExportRowExpand(WebPRescaler* const wrk) {
+static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
   int i;
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
@@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
-  WebPRescalerExportRowExpand = ExportRowExpand;
-  WebPRescalerExportRowShrink = ExportRowShrink;
+  WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
+  WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/rescaler_msa.c b/thirdparty/libwebp/src/dsp/rescaler_msa.c
index 2c10e55d8c..f3bc99f1cd 100644
--- a/thirdparty/libwebp/dsp/rescaler_msa.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_msa.c
@@ -11,14 +11,14 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_MSA)
+#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
 
 #include <assert.h>
 
-#include "../utils/rescaler_utils.h"
-#include "./msa_macro.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/dsp/msa_macro.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -246,7 +246,7 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
   }
 }
 
-static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_MSA(WebPRescaler* const wrk) {
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -411,7 +411,7 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
   }
 }
 
-static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_MSA(WebPRescaler* const wrk) {
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -433,8 +433,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
-  WebPRescalerExportRowExpand = RescalerExportRowExpand;
-  WebPRescalerExportRowShrink = RescalerExportRowShrink;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_MSA;
+  WebPRescalerExportRowShrink = RescalerExportRowShrink_MSA;
 }
 
 #else     // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/rescaler_neon.c b/thirdparty/libwebp/src/dsp/rescaler_neon.c
index b2dd8f30cc..3eff9fbaf4 100644
--- a/thirdparty/libwebp/dsp/rescaler_neon.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_neon.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
 
 #include <arm_neon.h>
 #include <assert.h>
-#include "./neon.h"
-#include "../utils/rescaler_utils.h"
+#include "src/dsp/neon.h"
+#include "src/utils/rescaler_utils.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -41,9 +41,9 @@
 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
 #endif
 
-static uint32x4_t Interpolate(const rescaler_t* const frow,
-                              const rescaler_t* const irow,
-                              uint32_t A, uint32_t B) {
+static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
+                                   const rescaler_t* const irow,
+                                   uint32_t A, uint32_t B) {
   LOAD_32x4(frow, A0);
   LOAD_32x4(irow, B0);
   const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
@@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow,
   return E;
 }
 
-static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
     const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
     for (x_out = 0; x_out < max_span; x_out += 8) {
       const uint32x4_t C0 =
-          Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
+          Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
       const uint32x4_t C1 =
-          Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
+          Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
       const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
       const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
       const uint16x4_t E0 = vmovn_u32(D0);
@@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
   }
 }
 
-static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
-  WebPRescalerExportRowExpand = RescalerExportRowExpand;
-  WebPRescalerExportRowShrink = RescalerExportRowShrink;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
+  WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
 }
 
 #else     // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/rescaler_sse2.c b/thirdparty/libwebp/src/dsp/rescaler_sse2.c
index 8271c22e05..f93b204fe1 100644
--- a/thirdparty/libwebp/dsp/rescaler_sse2.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
 #include <emmintrin.h>
 
 #include <assert.h>
-#include "../utils/rescaler_utils.h"
-#include "../utils/utils.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
@@ -27,7 +27,7 @@
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
 
 // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
-static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
+static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
   const __m128i B = _mm_unpacklo_epi8(A, zero);              // A0B0C0D0E0F0G0H0
@@ -36,14 +36,14 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
 }
 
 // input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
-static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
+static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
   *out = _mm_unpacklo_epi8(A, zero);
 }
 
-static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
-                                        const uint8_t* src) {
+static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
+                                         const uint8_t* src) {
   rescaler_t* frow = wrk->frow;
   const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
@@ -54,10 +54,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
   assert(wrk->x_expand);
   if (wrk->num_channels == 4) {
     if (wrk->src_width < 2) {
-      WebPRescalerImportRowExpandC(wrk, src);
+      WebPRescalerImportRowExpand_C(wrk, src);
       return;
     }
-    LoadTwoPixels(src, &cur_pixels);
+    LoadTwoPixels_SSE2(src, &cur_pixels);
     src += 4;
     while (1) {
       const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
@@ -67,7 +67,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
       if (frow >= frow_end) break;
       accum -= wrk->x_sub;
       if (accum < 0) {
-        LoadTwoPixels(src, &cur_pixels);
+        LoadTwoPixels_SSE2(src, &cur_pixels);
         src += 4;
         accum += x_add;
       }
@@ -76,10 +76,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
     int left;
     const uint8_t* const src_limit = src + wrk->src_width - 8;
     if (wrk->src_width < 8) {
-      WebPRescalerImportRowExpandC(wrk, src);
+      WebPRescalerImportRowExpand_C(wrk, src);
       return;
     }
-    LoadHeightPixels(src, &cur_pixels);
+    LoadHeightPixels_SSE2(src, &cur_pixels);
     src += 7;
     left = 7;
     while (1) {
@@ -94,7 +94,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
         if (--left) {
           cur_pixels = _mm_srli_si128(cur_pixels, 2);
         } else if (src <= src_limit) {
-          LoadHeightPixels(src, &cur_pixels);
+          LoadHeightPixels_SSE2(src, &cur_pixels);
           src += 7;
           left = 7;
         } else {   // tail
@@ -110,8 +110,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
   assert(accum == 0);
 }
 
-static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
-                                        const uint8_t* src) {
+static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
+                                         const uint8_t* src) {
   const int x_sub = wrk->x_sub;
   int accum = 0;
   const __m128i zero = _mm_setzero_si128();
@@ -123,7 +123,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
   const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
 
   if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
-    WebPRescalerImportRowShrinkC(wrk, src);
+    WebPRescalerImportRowShrink_C(wrk, src);
     return;
   }
   assert(!WebPRescalerInputDone(wrk));
@@ -169,12 +169,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
 // Row export
 
 // load *src as epi64, multiply by mult and store result in [out0 ... out3]
-static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
-                                            const __m128i* const mult,
-                                            __m128i* const out0,
-                                            __m128i* const out1,
-                                            __m128i* const out2,
-                                            __m128i* const out3) {
+static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
+                                                 const __m128i* const mult,
+                                                 __m128i* const out0,
+                                                 __m128i* const out1,
+                                                 __m128i* const out2,
+                                                 __m128i* const out3) {
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
   const __m128i A2 = _mm_srli_epi64(A0, 32);
@@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
   }
 }
 
-static WEBP_INLINE void ProcessRow(const __m128i* const A0,
-                                   const __m128i* const A1,
-                                   const __m128i* const A2,
-                                   const __m128i* const A3,
-                                   const __m128i* const mult,
-                                   uint8_t* const dst) {
+static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
+                                        const __m128i* const A1,
+                                        const __m128i* const A2,
+                                        const __m128i* const A3,
+                                        const __m128i* const mult,
+                                        uint8_t* const dst) {
   const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
   const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
   const __m128i B0 = _mm_mul_epu32(*A0, *mult);
@@ -210,7 +210,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
   const __m128i C3 = _mm_add_epi64(B3, rounder);
   const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
   const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
-#if (WEBP_RESCALER_FIX < 32)
+#if (WEBP_RESCALER_RFIX < 32)
   const __m128i D2 =
       _mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
   const __m128i D3 =
@@ -226,7 +226,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
   _mm_storel_epi64((__m128i*)dst, G);
 }
 
-static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -240,8 +240,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
   if (wrk->y_accum == 0) {
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3;
-      LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
-      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+      LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
+      ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
     }
     for (; x_out < x_out_max; ++x_out) {
       const uint32_t J = frow[x_out];
@@ -257,8 +257,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
     const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3, B0, B1, B2, B3;
-      LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
-      LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
+      LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
+      LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
       {
         const __m128i C0 = _mm_add_epi64(A0, B0);
         const __m128i C1 = _mm_add_epi64(A1, B1);
@@ -272,7 +272,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
         const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
         const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
         const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
-        ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
+        ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
       }
     }
     for (; x_out < x_out_max; ++x_out) {
@@ -286,7 +286,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
   }
 }
 
-static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -303,8 +303,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
     const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3, B0, B1, B2, B3;
-      LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
-      LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
+      LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+      LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
       {
         const __m128i C0 = _mm_add_epi64(B0, rounder);
         const __m128i C1 = _mm_add_epi64(B1, rounder);
@@ -324,7 +324,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
         const __m128i G1 = _mm_or_si128(D1, F3);
         _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
         _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
-        ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+        ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
       }
     }
     for (; x_out < x_out_max; ++x_out) {
@@ -340,10 +340,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
     const __m128i zero = _mm_setzero_si128();
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3;
-      LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+      LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
       _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
       _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
-      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+      ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
     }
     for (; x_out < x_out_max; ++x_out) {
       const int v = (int)MULT_FIX(irow[x_out], scale);
@@ -362,10 +362,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
-  WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
-  WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
-  WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
-  WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
+  WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
+  WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
+  WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/src/dsp/ssim.c b/thirdparty/libwebp/src/dsp/ssim.c
new file mode 100644
index 0000000000..dc1b518a33
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/ssim.c
@@ -0,0 +1,166 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>  // for abs()
+
+#include "src/dsp/dsp.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped 1-D filter, applied along both x and y. Coefficients sum to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // (sum{kWeight})^2: 2-D total
+
+static WEBP_INLINE double SSIMCalculation(
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const uint32_t w2 =  N * N;       // N^2 scales the three constants below
+  const uint32_t C1 = 20 * w2;      // added to the xm*ym and xm^2+ym^2 terms
+  const uint32_t C2 = 60 * w2;      // added to the sxy and sxx+syy terms
+  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  if (xmxm + ymym >= C3) {
+    const int64_t xmym = (int64_t)stats->xm * stats->ym;
+    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+    // descale by 8 bits (>> 8) to prevent overflow in the fnum/fden multiply.
+    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+    const uint64_t den_S = (sxx + syy + C2) >> 8;
+    const uint64_t fnum = (2 * xmym + C1) * num_S;
+    const uint64_t fden = (xmxm + ymym + C1) * den_S;
+    const double r = (double)fnum / fden;   // the returned score
+    assert(r >= 0. && r <= 1.0);
+    return r;
+  }
+  return 1.;   // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, kWeightSum);  // N = fixed full-window weight
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);  // N = weight stored in stats->w
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };   // every sum starts at zero
+  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+                                                  : yo + VP8_SSIM_KERNEL;
+  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+                                                  : xo + VP8_SSIM_KERNEL;
+  int x, y;   // window around (xo, yo), clamped to [0, W-1] x [0, H-1]
+  src1 += ymin * stride1;   // move to the first row of the window
+  src2 += ymin * stride2;
+  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+    for (x = xmin; x <= xmax; ++x) {
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;             // total weight of the samples visited
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStatsClipped(&stats);   // uses stats.w as sample count
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };   // every sum starts at zero
+  int x, y;   // src1/src2 are read as the top-left sample of the window
+  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+      const uint32_t w = kWeight[x] * kWeight[y];   // separable 2-D weight
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;        // note: stats.w is not accumulated here
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStats(&stats);   // sample count is kWeightSum there
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static uint32_t AccumulateSSE_C(const uint8_t* src1,
+                                const uint8_t* src2, int len) {
+  int i;
+  uint32_t sse2 = 0;     // running sum of squared differences
+  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
+  for (i = 0; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];   // 8b inputs: in [-255, 255]
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_REDUCE_SIZE)
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+#endif
+#if !defined(WEBP_DISABLE_STATS)
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
+
+extern void VP8SSIMDspInitSSE2(void);
+
+static volatile VP8CPUInfo ssim_last_cpuinfo_used =
+    (VP8CPUInfo)&ssim_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
+  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;  // already set up
+
+#if !defined(WEBP_REDUCE_SIZE)
+  VP8SSIMGetClipped = SSIMGetClipped_C;  // plain-C defaults first
+  VP8SSIMGet = SSIMGet_C;
+#endif
+
+#if !defined(WEBP_DISABLE_STATS)
+  VP8AccumulateSSE = AccumulateSSE_C;
+#endif
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SSIMDspInitSSE2();  // may replace some of the pointers set above
+    }
+#endif
+  }
+
+  ssim_last_cpuinfo_used = VP8GetCPUInfo;  // enables the early return above
+}
diff --git a/thirdparty/libwebp/src/dsp/ssim_sse2.c b/thirdparty/libwebp/src/dsp/ssim_sse2.c
new file mode 100644
index 0000000000..1dcb0eb0ec
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/ssim_sse2.c
@@ -0,0 +1,165 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "src/dsp/common_sse2.h"
+
+#if !defined(WEBP_DISABLE_STATS)
+
+// Helper: stores in '*sum' four 32b partial sums of (a[i] - b[i])^2.
+static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
+                                               __m128i* const sum) {
+  // take abs(a-b) in 8b: OR of the two saturating differences
+  const __m128i a_b = _mm_subs_epu8(a, b);
+  const __m128i b_a = _mm_subs_epu8(b, a);
+  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+  // zero-extend to 16b
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+  // multiply with self (madd also adds adjacent pairs into 32b lanes)
+  const __m128i sum1 = _mm_madd_epi16(C0, C0);
+  const __m128i sum2 = _mm_madd_epi16(C1, C1);
+  *sum = _mm_add_epi32(sum1, sum2);   // lane-wise; no horizontal add here
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+                                   const uint8_t* src2, int len) {
+  int i = 0;
+  uint32_t sse2 = 0;
+  if (len >= 16) {
+    const int limit = len - 32;   // each loop iteration reads 32 bytes at 'i'
+    int32_t tmp[4];               // receives the four 32b lanes of 'sum'
+    __m128i sum1;
+    __m128i sum = _mm_setzero_si128();
+    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+    i += 16;   // a0/b0 now hold a block that is loaded but not yet summed
+    while (i <= limit) {
+      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      __m128i sum2;
+      i += 16;
+      SubtractAndSquare_SSE2(a0, b0, &sum1);
+      sum = _mm_add_epi32(sum, sum1);
+      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      i += 16;
+      SubtractAndSquare_SSE2(a1, b1, &sum2);
+      sum = _mm_add_epi32(sum, sum2);
+    }
+    SubtractAndSquare_SSE2(a0, b0, &sum1);   // the last pending block
+    sum = _mm_add_epi32(sum, sum1);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+
+  for (; i < len; ++i) {   // scalar tail: fewer than 32 bytes remain
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
+  uint16_t tmp[8];
+  const __m128i a = _mm_srli_si128(*m, 8);   // upper 8 bytes shifted down
+  const __m128i b = _mm_add_epi16(*m, a);    // low 4 lanes: m[i] + m[i + 4]
+  _mm_storeu_si128((__m128i*)tmp, b);
+  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];  // tmp[4..7] unused
+}
+
+static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
+  const __m128i a = _mm_srli_si128(*m, 8);   // upper 8 bytes shifted down
+  const __m128i b = _mm_add_epi32(*m, a);    // lanes 0,1: m[i] + m[i + 2]
+  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));  // lane 0: total
+  return (uint32_t)_mm_cvtsi128_si32(c);     // extract lane 0
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };  // 8th is 0
+
+#define ACCUMULATE_ROW(WEIGHT) do {                         \
+  /* compute row weight (Wx * Wy) */                        \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
+  /* process 8 bytes at a time (7 bytes, actually) */       \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+  /* convert to 16b and multiply by weight */               \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
+  /* accumulate: 16b for xm/ym, 32b for the products */     \
+  xm  = _mm_add_epi16(xm, wa1);                             \
+  ym  = _mm_add_epi16(ym, wb1);                             \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
+  src1 += stride1;                                          \
+  src2 += stride2;                                          \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2) {
+  VP8DistoStats stats;   // note: stats.w is never assigned in this function
+  const __m128i zero = _mm_setzero_si128();
+  __m128i xm = zero, ym = zero;                // 16b accums
+  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+  assert(2 * VP8_SSIM_KERNEL + 1 == 7);   // the seven rows below assume this
+  ACCUMULATE_ROW(1);   // one call per row; argument is that row's weight
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
+  stats.xm  = HorizontalAdd16b_SSE2(&xm);   // reduce the lanes to scalars
+  stats.ym  = HorizontalAdd16b_SSE2(&ym);
+  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
+  stats.xym = HorizontalAdd32b_SSE2(&xym);
+  stats.yym = HorizontalAdd32b_SSE2(&yym);
+  return VP8SSIMFromStats(&stats);
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+#if !defined(WEBP_DISABLE_STATS)
+  VP8AccumulateSSE = AccumulateSSE_SSE2;
+#endif
+#if !defined(WEBP_REDUCE_SIZE)
+  VP8SSIMGet = SSIMGet_SSE2;  // VP8SSIMGetClipped is not assigned here
+#endif
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/upsampling.c b/thirdparty/libwebp/src/dsp/upsampling.c
index 265e722c10..e72626a82a 100644
--- a/thirdparty/libwebp/dsp/upsampling.c
+++ b/thirdparty/libwebp/src/dsp/upsampling.c
@@ -11,8 +11,8 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include "./dsp.h"
-#include "./yuv.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/yuv.h"
 
 #include <assert.h>
 
@@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
       const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
       const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
       FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
-           top_dst + (2 * x - 1) * XSTEP);                                     \
+           top_dst + (2 * x - 1) * (XSTEP));                                   \
       FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
-           top_dst + (2 * x - 0) * XSTEP);                                     \
+           top_dst + (2 * x - 0) * (XSTEP));                                   \
     }                                                                          \
     if (bottom_y != NULL) {                                                    \
       const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
       const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
       FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
-           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+           bottom_dst + (2 * x - 1) * (XSTEP));                                \
       FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
-           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+           bottom_dst + (2 * x + 0) * (XSTEP));                                \
     }                                                                          \
     tl_uv = t_uv;                                                              \
     l_uv = uv;                                                                 \
@@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
     {                                                                          \
       const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
       FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
-           top_dst + (len - 1) * XSTEP);                                       \
+           top_dst + (len - 1) * (XSTEP));                                     \
     }                                                                          \
     if (bottom_y != NULL) {                                                    \
       const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
       FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
-           bottom_dst + (len - 1) * XSTEP);                                    \
+           bottom_dst + (len - 1) * (XSTEP));                                  \
     }                                                                          \
   }                                                                            \
 }
 
 // All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+#if !WEBP_NEON_OMIT_C_CODE
+UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgbLinePair_C,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair_C,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair_C,  VP8YuvToRgb565,  2)
+#else
+static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
+                              const uint8_t* top_u, const uint8_t* top_v,
+                              const uint8_t* cur_u, const uint8_t* cur_v,
+                              uint8_t* top_dst, uint8_t* bottom_dst, int len) {
+  (void)top_y;   // every parameter is deliberately unused
+  (void)bottom_y;
+  (void)top_u;
+  (void)top_v;
+  (void)cur_u;
+  (void)cur_v;
+  (void)top_dst;
+  (void)bottom_dst;
+  (void)len;
+  assert(0);   // COLORSPACE SUPPORT NOT COMPILED (stub must not be called)
+}
+#define UpsampleArgbLinePair_C EmptyUpsampleFunc
+#define UpsampleRgbLinePair_C EmptyUpsampleFunc
+#define UpsampleBgrLinePair_C EmptyUpsampleFunc
+#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
+#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
+#endif   // WEBP_REDUCE_CSP
+
+#endif
 
 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
 
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
   WebPInitUpsamplers();
-  VP8YUVInit();
 #ifdef FANCY_UPSAMPLING
   return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
 #else
@@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
 void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,           \
                uint8_t* dst, int len) {                                        \
   int i;                                                                       \
-  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]);         \
 }
 
-YUV444_FUNC(WebPYuv444ToRgbC,      VP8YuvToRgb,  3)
-YUV444_FUNC(WebPYuv444ToBgrC,      VP8YuvToBgr,  3)
-YUV444_FUNC(WebPYuv444ToRgbaC,     VP8YuvToRgba, 4)
-YUV444_FUNC(WebPYuv444ToBgraC,     VP8YuvToBgra, 4)
-YUV444_FUNC(WebPYuv444ToArgbC,     VP8YuvToArgb, 4)
-YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
-YUV444_FUNC(WebPYuv444ToRgb565C,   VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgba_C,     VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgra_C,     VP8YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(WebPYuv444ToRgb_C,      VP8YuvToRgb,  3)
+YUV444_FUNC(WebPYuv444ToBgr_C,      VP8YuvToBgr,  3)
+YUV444_FUNC(WebPYuv444ToArgb_C,     VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565_C,   VP8YuvToRgb565, 2)
+#else
+static void EmptyYuv444Func(const uint8_t* y,
+                            const uint8_t* u, const uint8_t* v,
+                            uint8_t* dst, int len) {
+  (void)y;   // every parameter is deliberately unused
+  (void)u;
+  (void)v;
+  (void)dst;
+  (void)len;   // silent no-op: 'dst' is left untouched, nothing is asserted
+}
+#define WebPYuv444ToRgb_C EmptyYuv444Func
+#define WebPYuv444ToBgr_C EmptyYuv444Func
+#define WebPYuv444ToArgb_C EmptyYuv444Func
+#define WebPYuv444ToRgba4444_C EmptyYuv444Func
+#define WebPYuv444ToRgb565_C EmptyYuv444Func
+#endif   // WEBP_REDUCE_CSP
 
 #undef YUV444_FUNC
 
@@ -182,17 +224,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
   if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
 
-  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgbC;
-  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgbaC;
-  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgrC;
-  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgraC;
-  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgbC;
-  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
-  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565C;
-  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgbaC;
-  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgraC;
-  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgbC;
-  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
+  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
+  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgra_C;
+  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgb_C;
+  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgr_C;
+  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgb_C;
+  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
+  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565_C;
+  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgba_C;
+  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgra_C;
+  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgb_C;
+  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -224,17 +266,19 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
   if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
 
 #ifdef FANCY_UPSAMPLING
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair_C;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair_C;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair_C;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair_C;
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair_C;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair_C;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair_C;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair_C;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair_C;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
+#endif
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -243,11 +287,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
       WebPInitUpsamplersSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitUpsamplersNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitUpsamplersMIPSdspR2();
@@ -259,6 +298,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitUpsamplersNEON();
+  }
+#endif
+
+  assert(WebPUpsamplers[MODE_RGBA] != NULL);
+  assert(WebPUpsamplers[MODE_BGRA] != NULL);
+  assert(WebPUpsamplers[MODE_rgbA] != NULL);
+  assert(WebPUpsamplers[MODE_bgrA] != NULL);
+  assert(WebPUpsamplers[MODE_RGB] != NULL);
+  assert(WebPUpsamplers[MODE_BGR] != NULL);
+  assert(WebPUpsamplers[MODE_ARGB] != NULL);
+  assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
+  assert(WebPUpsamplers[MODE_RGB_565] != NULL);
+  assert(WebPUpsamplers[MODE_Argb] != NULL);
+  assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
+
 #endif  // FANCY_UPSAMPLING
   upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/upsampling_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
index ed2eb74825..10d499d771 100644
--- a/thirdparty/libwebp/dsp/upsampling_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
@@ -12,14 +12,12 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut  (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
 #include <assert.h>
-#include "./yuv.h"
-
-#if !defined(WEBP_YUV_USE_TABLE)
+#include "src/dsp/yuv.h"
 
 #define YUV_TO_RGB(Y, U, V, R, G, B) do {                                      \
     const int t1 = MultHi(Y, 19077);                                           \
@@ -48,6 +46,7 @@
     );                                                                         \
   } while (0)
 
+#if !defined(WEBP_REDUCE_CSP)
 static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
   int r, g, b;
   YUV_TO_RGB(y, u, v, r, g, b);
@@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
   {
     const int rg = (r & 0xf8) | (g >> 5);
     const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     rgb[0] = gb;
     rgb[1] = rg;
 #else
@@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
   {
     const int rg = (r & 0xf0) | (g >> 4);
     const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     argb[0] = ba;
     argb[1] = rg;
 #else
@@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
 #endif
    }
 }
-#endif  // WEBP_YUV_USE_TABLE
+#endif   // WEBP_REDUCE_CSP
 
 //-----------------------------------------------------------------------------
 // Alpha handling variants
 
+#if !defined(WEBP_REDUCE_CSP)
 static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
                                   uint8_t* const argb) {
   int r, g, b;
@@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
   argb[2] = g;
   argb[3] = b;
 }
+#endif   // WEBP_REDUCE_CSP
 static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
                                   uint8_t* const bgra) {
   int r, g, b;
@@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }
 
 // All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
 UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
 
 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
 extern void WebPInitUpsamplersMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
   WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
   WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
   WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
   WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
   WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
@@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }
 
-YUV444_FUNC(Yuv444ToRgb,      YuvToRgb,      3)
-YUV444_FUNC(Yuv444ToBgr,      YuvToBgr,      3)
 YUV444_FUNC(Yuv444ToRgba,     YuvToRgba,     4)
 YUV444_FUNC(Yuv444ToBgra,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb,      YuvToRgb,      3)
+YUV444_FUNC(Yuv444ToBgr,      YuvToBgr,      3)
 YUV444_FUNC(Yuv444ToArgb,     YuvToArgb,     4)
 YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
 YUV444_FUNC(Yuv444ToRgb565,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
 
 #undef YUV444_FUNC
 
@@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565,   YuvToRgb565,   2)
 extern void WebPInitYUV444ConvertersMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
-  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
   WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
   WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
   WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb;
   WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
   WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565;
-  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
   WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb;
   WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/upsampling_msa.c b/thirdparty/libwebp/src/dsp/upsampling_msa.c
index f24926fa94..535ffb772c 100644
--- a/thirdparty/libwebp/dsp/upsampling_msa.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_msa.c
@@ -12,12 +12,12 @@
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
 #include <string.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./msa_macro.h"
-#include "./yuv.h"
+#include "src/dsp/msa_macro.h"
+#include "src/dsp/yuv.h"
 
 #ifdef FANCY_UPSAMPLING
 
@@ -274,7 +274,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
   const int b = Clip8(b1 >> 6);
   const int rg = (r & 0xf8) | (g >> 5);
   const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   rgb[0] = gb;
   rgb[1] = rg;
 #else
@@ -293,7 +293,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
   const int b = Clip8(b1 >> 6);
   const int rg = (r & 0xf0) | (g >> 4);
   const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   argb[0] = ba;
   argb[1] = rg;
 #else
@@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     STORE16_4(R, G, B, A, dst);
@@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
 static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     STORE16_4(B, G, R, A, dst);
@@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     STORE16_4(A, R, G, B, dst);
@@ -459,11 +459,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
                               const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B, RG, BA, tmp0, tmp1;
   while (length >= 16) {
-  #ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
-  #else
+#else
     CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
-  #endif
+#endif
     y      += 16;
     u      += 16;
     v      += 16;
@@ -473,7 +473,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
   if (length > 8) {
     uint8_t temp[2 * 16] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
 #else
     CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
@@ -482,7 +482,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
   } else if (length > 0) {
     uint8_t temp[2 * 8] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
 #else
     CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
@@ -495,11 +495,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
                             const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B, RG, GB, tmp0, tmp1;
   while (length >= 16) {
-  #ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGB565(y, u, v, GB, RG, 16, dst);
-  #else
+#else
     CALC_RGB565(y, u, v, RG, GB, 16, dst);
-  #endif
+#endif
     y      += 16;
     u      += 16;
     v      += 16;
@@ -509,7 +509,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
   if (length > 8) {
     uint8_t temp[2 * 16] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGB565(temp, u, v, GB, RG, 16, temp);
 #else
     CALC_RGB565(temp, u, v, RG, GB, 16, temp);
@@ -518,7 +518,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
   } else if (length > 0) {
     uint8_t temp[2 * 8] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGB565(temp, u, v, GB, RG, 8, temp);
 #else
     CALC_RGB565(temp, u, v, RG, GB, 8, temp);
@@ -640,13 +640,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
   }                                                                      \
 }
 
-UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
 UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
 
 //------------------------------------------------------------------------------
 // Entry point
@@ -656,17 +658,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 extern void WebPInitUpsamplersMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
   WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
   WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
   WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
   WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
   WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
diff --git a/thirdparty/libwebp/dsp/upsampling_neon.c b/thirdparty/libwebp/src/dsp/upsampling_neon.c
index d371a834ff..17cbc9f911 100644
--- a/thirdparty/libwebp/dsp/upsampling_neon.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_neon.c
@@ -12,15 +12,15 @@
 // Author: mans@mansr.com (Mans Rullgard)
 // Based on SSE code by: somnath@google.com (Somnath Banerjee)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
 #include <arm_neon.h>
 #include <string.h>
-#include "./neon.h"
-#include "./yuv.h"
+#include "src/dsp/neon.h"
+#include "src/dsp/yuv.h"
 
 #ifdef FANCY_UPSAMPLING
 
@@ -58,8 +58,8 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
-                             uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
+                                  uint8_t *out) {
   UPSAMPLE_16PIXELS(r1, r2, out);
 }
 
@@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
   /* replicate last byte */                                             \
   memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
   memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
-  Upsample16Pixels(r1, r2, out);                                        \
+  Upsample16Pixels_NEON(r1, r2, out);                                   \
 }
 
 //-----------------------------------------------------------------------------
@@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
 }
 
 // NEON variants of the fancy upsampler.
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair,  Rgb,  3)
-NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair,  Bgr,  3)
-NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
-NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
-NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
-NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
-NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON,  Rgb,  3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON,  Bgr,  3)
+NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
+NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
+#endif   // WEBP_REDUCE_CSP
 
 //------------------------------------------------------------------------------
 // Entry point
@@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 extern void WebPInitUpsamplersNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair_NEON;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair_NEON;
+  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
+  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
+  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
diff --git a/thirdparty/libwebp/dsp/upsampling_sse2.c b/thirdparty/libwebp/src/dsp/upsampling_sse2.c
index b5b668900f..fd5d303982 100644
--- a/thirdparty/libwebp/dsp/upsampling_sse2.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -11,14 +11,14 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
 #include <assert.h>
 #include <emmintrin.h>
 #include <string.h>
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 #ifdef FANCY_UPSAMPLING
 
@@ -83,13 +83,13 @@
   GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                                \
   /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
-  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
+  PACK_AND_STORE(a, b, diag1, diag2, (out) +      0);  /* store top */         \
+  PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32);  /* store bottom */      \
 }
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
-                             uint8_t* const out) {
+static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
+                                  uint8_t* const out) {
   UPSAMPLE_32PIXELS(r1, r2, out);
 }
 
@@ -101,30 +101,30 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
   memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels));          \
   memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels));          \
   /* using the shared function instead of the macro saves ~3k code size */     \
-  Upsample32Pixels(r1, r2, out);                                               \
+  Upsample32Pixels_SSE2(r1, r2, out);                                          \
 }
 
 #define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
                     top_dst, bottom_dst, cur_x, num_pixels) {                  \
   int n;                                                                       \
   for (n = 0; n < (num_pixels); ++n) {                                         \
-    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
-         top_dst + ((cur_x) + n) * XSTEP);                                     \
+    FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n],                                 \
+         (top_dst) + ((cur_x) + n) * (XSTEP));                                 \
   }                                                                            \
-  if (bottom_y != NULL) {                                                      \
+  if ((bottom_y) != NULL) {                                                    \
     for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
-           bottom_dst + ((cur_x) + n) * XSTEP);                                \
+      FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n],                  \
+           (bottom_dst) + ((cur_x) + n) * (XSTEP));                            \
     }                                                                          \
   }                                                                            \
 }
 
 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                        top_dst, bottom_dst, cur_x) do {                        \
-  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
-  if (bottom_y != NULL) {                                                      \
-    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
-             bottom_dst + (cur_x) * XSTEP);                                    \
+  FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP));   \
+  if ((bottom_y) != NULL) {                                                    \
+    FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64,                    \
+                  (bottom_dst) + (cur_x) * (XSTEP));                           \
   }                                                                            \
 } while (0)
 
@@ -169,13 +169,16 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }
 
 // SSE2 variants of the fancy upsampler.
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
+
+#if !defined(WEBP_REDUCE_CSP)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2,  VP8YuvToRgb,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2,  VP8YuvToBgr,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
+#endif   // WEBP_REDUCE_CSP
 
 #undef GET_M
 #undef PACK_AND_STORE
@@ -193,17 +196,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 extern void WebPInitUpsamplersSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair_SSE2;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair_SSE2;
+  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
+  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
+  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
@@ -213,29 +218,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);
 
-#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
-extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u,             \
-                               const uint8_t* v, uint8_t* dst, int len);       \
+#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
+extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+                   uint8_t* dst, int len);                                     \
 static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
                       uint8_t* dst, int len) {                                 \
   int i;                                                                       \
   const int max_len = len & ~31;                                               \
-  for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
+  for (i = 0; i < max_len; i += 32) {                                          \
+    CALL(y + i, u + i, v + i, dst + i * (XSTEP));                              \
+  }                                                                            \
   if (i < len) {  /* C-fallback */                                             \
-    WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i);         \
+    CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i);                   \
   }                                                                            \
 }
 
-YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
-YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
-YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
+YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
+YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
+YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
+YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
+YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
+            WebPYuv444ToRgba4444_C, 2)
+YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
+#endif   // WEBP_REDUCE_CSP
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
-  WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
-  WebPYUV444Converters[MODE_RGB]  = Yuv444ToRgb;
-  WebPYUV444Converters[MODE_BGR]  = Yuv444ToBgr;
+  WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba_SSE2;
+  WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra_SSE2;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba_SSE2;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra_SSE2;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb_SSE2;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr_SSE2;
+  WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb_SSE2;
+  WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
+  WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565_SSE2;
+  WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb_SSE2;
+  WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #else
diff --git a/thirdparty/libwebp/dsp/yuv.c b/thirdparty/libwebp/src/dsp/yuv.c
index dd7d9dedfa..bddf81fe09 100644
--- a/thirdparty/libwebp/dsp/yuv.c
+++ b/thirdparty/libwebp/src/dsp/yuv.c
@@ -11,63 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
+#include <assert.h>
 #include <stdlib.h>
 
-#if defined(WEBP_YUV_USE_TABLE)
-
-static int done = 0;
-
-static WEBP_INLINE uint8_t clip(int v, int max_value) {
-  return v < 0 ? 0 : v > max_value ? max_value : v;
-}
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
-  int i;
-  if (done) {
-    return;
-  }
-#ifndef USE_YUVj
-  for (i = 0; i < 256; ++i) {
-    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
-    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
-    VP8kVToG[i] = -45773 * (i - 128);
-    VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
-  }
-  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
-    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
-    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
-    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
-  }
-#else
-  for (i = 0; i < 256; ++i) {
-    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
-    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
-    VP8kVToG[i] = -46802 * (i - 128);
-    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
-  }
-  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
-    const int k = i;
-    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
-    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
-  }
-#endif
-
-  done = 1;
-}
-
-#else
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
-
-#endif  // WEBP_YUV_USE_TABLE
-
 //-----------------------------------------------------------------------------
 // Plain-C version
 
@@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
 static void FUNC_NAME(const uint8_t* y,                                        \
                       const uint8_t* u, const uint8_t* v,                      \
                       uint8_t* dst, int len) {                                 \
-  const uint8_t* const end = dst + (len & ~1) * XSTEP;                         \
+  const uint8_t* const end = dst + (len & ~1) * (XSTEP);                       \
   while (dst != end) {                                                         \
     FUNC(y[0], u[0], v[0], dst);                                               \
-    FUNC(y[1], u[0], v[0], dst + XSTEP);                                       \
+    FUNC(y[1], u[0], v[0], dst + (XSTEP));                                     \
     y += 2;                                                                    \
     ++u;                                                                       \
     ++v;                                                                       \
-    dst += 2 * XSTEP;                                                          \
+    dst += 2 * (XSTEP);                                                        \
   }                                                                            \
   if (len & 1) {                                                               \
     FUNC(y[0], u[0], v[0], dst);                                               \
@@ -168,7 +116,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
 //-----------------------------------------------------------------------------
 // ARGB -> YUV converters
 
-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const uint32_t p = argb[i];
@@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 
 //-----------------------------------------------------------------------------
 
-static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
   int i;
   for (i = 0; i < width; ++i, rgb += 3) {
     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
   }
 }
 
-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
   int i;
   for (i = 0; i < width; ++i, bgr += 3) {
     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
@@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 
 //-----------------------------------------------------------------------------
 
+#if !WEBP_NEON_OMIT_C_CODE
 #define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
 static uint16_t clip_y(int v) {
   return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
@@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
     out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef MAX_Y
 
@@ -308,22 +258,26 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
     (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
 
 extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitConvertARGBToYUVNEON(void);
 extern void WebPInitSharpYUVSSE2(void);
+extern void WebPInitSharpYUVNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
   if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  WebPConvertARGBToY = ConvertARGBToY;
+  WebPConvertARGBToY = ConvertARGBToY_C;
   WebPConvertARGBToUV = WebPConvertARGBToUV_C;
 
-  WebPConvertRGB24ToY = ConvertRGB24ToY;
-  WebPConvertBGR24ToY = ConvertBGR24ToY;
+  WebPConvertRGB24ToY = ConvertRGB24ToY_C;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_C;
 
   WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
 
+#if !WEBP_NEON_OMIT_C_CODE
   WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
   WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
   WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
+#endif
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -333,5 +287,23 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
     }
 #endif  // WEBP_USE_SSE2
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitConvertARGBToYUVNEON();
+    WebPInitSharpYUVNEON();
+  }
+#endif  // WEBP_USE_NEON
+
+  assert(WebPConvertARGBToY != NULL);
+  assert(WebPConvertARGBToUV != NULL);
+  assert(WebPConvertRGB24ToY != NULL);
+  assert(WebPConvertBGR24ToY != NULL);
+  assert(WebPConvertRGBA32ToUV != NULL);
+  assert(WebPSharpYUVUpdateY != NULL);
+  assert(WebPSharpYUVUpdateRGB != NULL);
+  assert(WebPSharpYUVFilterRow != NULL);
+
   rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/yuv.h b/thirdparty/libwebp/src/dsp/yuv.h
index 1d33b5863b..c8a55832d4 100644
--- a/thirdparty/libwebp/dsp/yuv.h
+++ b/thirdparty/libwebp/src/dsp/yuv.h
@@ -35,18 +35,8 @@
 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_
 
-#include "./dsp.h"
-#include "../dec/vp8_dec.h"
-
-#if defined(WEBP_EXPERIMENTAL_FEATURES)
-// Do NOT activate this feature for real compression. This is only experimental!
-// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
-// This colorspace is close to Rec.601's Y'CbCr model with the notable
-// difference of allowing larger range for luma/chroma.
-// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
-// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-// #define USE_YUVj
-#endif
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8_dec.h"
 
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion
@@ -58,12 +48,8 @@ extern "C" {
 enum {
   YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
   YUV_HALF = 1 << (YUV_FIX - 1),
-  YUV_MASK = (256 << YUV_FIX) - 1,
-  YUV_RANGE_MIN = -227,            // min value of r/g/b output
-  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
 
   YUV_FIX2 = 6,                   // fixed-point precision for YUV->RGB
-  YUV_HALF2 = 1 << YUV_FIX2 >> 1,
   YUV_MASK2 = (256 << YUV_FIX2) - 1
 };
 
@@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
   const int b = VP8YUVToB(y, u);      // 5 usable bits
   const int rg = (r & 0xf8) | (g >> 5);
   const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   rgb[0] = gb;
   rgb[1] = rg;
 #else
@@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
   const int b = VP8YUVToB(y, u);        // 4 usable bits
   const int rg = (r & 0xf0) | (g >> 4);
   const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   argb[0] = ba;
   argb[1] = rg;
 #else
@@ -157,29 +143,26 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
   rgba[3] = 0xff;
 }
 
-// Must be called before everything, to initialize the tables.
-void VP8YUVInit(void);
-
 //-----------------------------------------------------------------------------
 // SSE2 extra functions (mostly for upsampling_sse2.c)
 
 #if defined(WEBP_USE_SSE2)
 
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst);
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst);
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                      uint8_t* dst);
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                             const uint8_t* v, uint8_t* dst);
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst);
 
 #endif    // WEBP_USE_SSE2
 
@@ -192,8 +175,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
   return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
 }
 
-#ifndef USE_YUVj
-
 static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
   const int luma = 16839 * r + 33059 * g + 6420 * b;
   return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
@@ -209,28 +190,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
   return VP8ClipUV(v, rounding);
 }
 
-#else
-
-// This JPEG-YUV colorspace, only for comparison!
-// These are also 16bit precision coefficients from Rec.601, but with full
-// [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
-  const int luma = 19595 * r + 38470 * g + 7471 * b;
-  return (luma + rounding) >> YUV_FIX;  // no need to clip
-}
-
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
-  const int u = -11058 * r - 21710 * g + 32768 * b;
-  return VP8ClipUV(u, rounding);
-}
-
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
-  const int v = 32768 * r - 27439 * g - 5329 * b;
-  return VP8ClipUV(v, rounding);
-}
-
-#endif    // USE_YUVj
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/thirdparty/libwebp/dsp/yuv_mips32.c b/thirdparty/libwebp/src/dsp/yuv_mips32.c
index e61aac571f..9d0a887824 100644
--- a/thirdparty/libwebp/dsp/yuv_mips32.c
+++ b/thirdparty/libwebp/src/dsp/yuv_mips32.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 //------------------------------------------------------------------------------
 // simple point-sampling
@@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y,                                        \
   }                                                                            \
 }
 
-ROW_FUNC(YuvToRgbRow,      3, 0, 1, 2, 0)
-ROW_FUNC(YuvToRgbaRow,     4, 0, 1, 2, 3)
-ROW_FUNC(YuvToBgrRow,      3, 2, 1, 0, 0)
-ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
+ROW_FUNC(YuvToRgbRow_MIPS32,      3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPS32,     4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPS32,      3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPS32,     4, 2, 1, 0, 3)
 
 #undef ROW_FUNC
 
@@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
 extern void WebPInitSamplersMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
-  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
-  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
-  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
-  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_MIPS32;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_MIPS32;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/yuv_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
index 1720d4190f..cc8afcc756 100644
--- a/thirdparty/libwebp/dsp/yuv_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s):  Branimir Vasic (branimir.vasic@imgtec.com)
 //             Djordje Pesut  (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 //------------------------------------------------------------------------------
 // simple point-sampling
@@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y,                                        \
   }                                                                            \
 }
 
-ROW_FUNC(YuvToRgbRow,      3, 0, 1, 2, 0)
-ROW_FUNC(YuvToRgbaRow,     4, 0, 1, 2, 3)
-ROW_FUNC(YuvToBgrRow,      3, 2, 1, 0, 0)
-ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
+ROW_FUNC(YuvToRgbRow_MIPSdspR2,      3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPSdspR2,     4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPSdspR2,      3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPSdspR2,     4, 2, 1, 0, 3)
 
 #undef ROW_FUNC
 #undef ASM_CLOBBER_LIST
@@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
 extern void WebPInitSamplersMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
-  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
-  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
-  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
-  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_MIPSdspR2;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_MIPSdspR2;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/src/dsp/yuv_neon.c b/thirdparty/libwebp/src/dsp/yuv_neon.c
new file mode 100644
index 0000000000..a34d60248f
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/yuv_neon.c
@@ -0,0 +1,288 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/yuv.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "src/dsp/neon.h"
+
+//-----------------------------------------------------------------------------
+
+static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
+                                    const uint8x8_t G,
+                                    const uint8x8_t B) {  // returns 8 luma values, same weights as VP8RGBToY()
+  const uint16x8_t r = vmovl_u8(R);  // widen the 8 samples of each channel to 16b
+  const uint16x8_t g = vmovl_u8(G);
+  const uint16x8_t b = vmovl_u8(B);
+  const uint16x4_t r_lo = vget_low_u16(r);  // split in halves: the widening multiplies take 4 lanes
+  const uint16x4_t r_hi = vget_high_u16(r);
+  const uint16x4_t g_lo = vget_low_u16(g);
+  const uint16x4_t g_hi = vget_high_u16(g);
+  const uint16x4_t b_lo = vget_low_u16(b);
+  const uint16x4_t b_hi = vget_high_u16(b);
+  const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);  // 16839 * r
+  const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
+  const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);  // + 33059 * g
+  const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
+  const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);   // + 6420 * b
+  const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
+  const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),   // rounding shift right by 16, narrowed to 16b
+                                     vrshrn_n_u32(tmp2_hi, 16));
+  const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));  // add the constant luma offset of 16
+  return vqmovn_u16(Y2);  // saturating narrow back to 8b
+}
+
+static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {  // packed 3-byte R,G,B pixels -> 'width' luma bytes
+  int i;  // index of the next luma sample to write
+  for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {  // 8 pixels (24 source bytes) per pass
+    const uint8x8x3_t RGB = vld3_u8(rgb);  // de-interleaving load: val[0..2] are used as R, G, B
+    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i, rgb += 3) {   // left-over: scalar path for the last (fewer than 8) pixels
+    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+  }
+}
+
+static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {  // packed 3-byte B,G,R pixels -> 'width' luma bytes
+  int i;  // index of the next luma sample to write
+  for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {  // 8 pixels (24 source bytes) per pass
+    const uint8x8x3_t BGR = vld3_u8(bgr);  // de-interleaving load: val[0..2] are used as B, G, R
+    const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);  // channels swapped back to R, G, B order
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i, bgr += 3) {  // left-over: scalar path for the last (fewer than 8) pixels
+    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+  }
+}
+
+static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {  // 32b pixels -> 'width' luma bytes
+  int i;  // pixel index, shared by the vector loop and the scalar tail
+  for (i = 0; i + 8 <= width; i += 8) {  // 8 pixels (32 source bytes) per pass
+    const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);  // de-interleave by byte; val[3] is not used
+    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);  // NOTE(review): byte 2 as R / byte 0 as B matches the shifts below only on little-endian -- confirm
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i) {   // left-over: scalar path for the last (fewer than 8) pixels
+    const uint32_t p = argb[i];
+    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
+                     YUV_HALF);
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+// Declares the signed 16b half-vectors (r_lo, r_hi, g_lo, ...) read by MULTIPLY_16b.
+#define MULTIPLY_16b_PREAMBLE(r, g, b)                           \
+  const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));  \
+  const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
+  const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));  \
+  const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
+  const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));  \
+  const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
+// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
+#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {              \
+  const int32x4_t tmp0_lo = vmull_n_s16(         r_lo, C0);      \
+  const int32x4_t tmp0_hi = vmull_n_s16(         r_hi, C0);      \
+  const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1);      \
+  const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1);      \
+  const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2);      \
+  const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2);      \
+  const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16),  \
+                                      vshrn_n_s32(tmp2_hi, 16)); \
+  DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST));                   \
+} while (0)
+
+// Fills U_DST / V_DST (int16x8_t) from 16b r/g/b vectors, still scaled by (1 << SHIFT). This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
+#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {     \
+  MULTIPLY_16b_PREAMBLE(r, g, b);                                \
+  MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST);       \
+  MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
+} while (0)
+
+static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,  // 4 x 16b values per output sample; only the first three are read
+                                   uint8_t* u, uint8_t* v, int width) {  // writes 'width' bytes to each of u and v
+  int i;  // index of the next u/v sample to write
+  for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {  // 8 output samples per pass
+    const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);  // de-interleaving load; val[3] is not used
+    int16x8_t U, V;
+    CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);  // NOTE(review): SHIFT=2 suggests each input is a sum of 4 pixels -- confirm with caller
+    vst1_u8(u + i, vqrshrun_n_s16(U, 2));  // rounding >> 2, saturated to unsigned 8b
+    vst1_u8(v + i, vqrshrun_n_s16(V, 2));
+  }
+  for (; i < width; i += 1, rgb += 4) {  // left-over: scalar path for the last (fewer than 8) samples
+    const int r = rgb[0], g = rgb[1], b = rgb[2];
+    u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+    v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+  }
+}
+
+static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                 int src_width, int do_store) {  // do_store != 0: overwrite u/v; 0: average into the existing u/v
+  int i;  // source pixel index
+  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {  // 16 source pixels -> 8 u/v samples per pass
+    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);  // de-interleave by byte; val[3] is not used
+    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
+    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
+    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
+    int16x8_t U_tmp, V_tmp;
+    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);  // SHIFT=1: inputs are sums of two adjacent pixels
+    {
+      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);  // rounding >> 1, saturated to unsigned 8b
+      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
+      if (do_store) {
+        vst1_u8(u, U);
+        vst1_u8(v, V);
+      } else {
+        const uint8x8_t prev_u = vld1_u8(u);  // values already present in the output row
+        const uint8x8_t prev_v = vld1_u8(v);
+        vst1_u8(u, vrhadd_u8(U, prev_u));  // rounding average of new and previous values
+        vst1_u8(v, vrhadd_u8(V, prev_v));
+      }
+    }
+  }
+  if (i < src_width) {  // left-over: remaining pixels go to the C version (u and v were advanced in the loop)
+    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+  }
+}
+
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {  // points the WebPConvert* function pointers at the NEON versions in this file
+  WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
+  WebPConvertARGBToY = ConvertARGBToY_NEON;
+  WebPConvertARGBToUV = ConvertARGBToUV_NEON;
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
+}
+
+//------------------------------------------------------------------------------
+
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y_NEON(int v) {  // scalar helper for the loop tails below
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;  // clamp v to [0, MAX_Y]
+}
+
+static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len) {  // dst[i] = clamp(dst[i] + ref[i] - src[i]); returns the sum of |ref[i] - src[i]|
+  int i;
+  const int16x8_t zero = vdupq_n_s16(0);  // lower clamp bound
+  const int16x8_t max = vdupq_n_s16(MAX_Y);  // upper clamp bound
+  uint64x2_t sum = vdupq_n_u64(0);  // two 64b partial sums of abs(diff_y)
+  uint64_t diff;
+
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per pass
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));  // new_y clamped to [0, MAX_Y]
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));  // widen 16b -> 32b -> 64b while accumulating
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);  // fold the two partial sums
+  for (; i < len; ++i) {  // left-over: scalar path for the last (fewer than 8) samples
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_y_NEON(new_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {  // dst[i] += ref[i] - src[i], with no clamping
+  int i;
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per pass
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);
+  }
+  for (; i < len; ++i) {  // left-over: scalar path for the last (fewer than 8) samples
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out) {  // reads A/B at [0..len] and best_y at [0..2*len-1]; writes out[0..2*len-1]
+  int i;
+  const int16x8_t max = vdupq_n_s16(MAX_Y);  // upper clamp bound
+  const int16x8_t zero = vdupq_n_s16(0);  // lower clamp bound
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 input positions -> 16 outputs per pass
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);  // same row, shifted by one sample
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);  // (3*A0 + A1 + B0 + 3*B1) >> 3
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);  // (A0 + 3*A1 + 3*B0 + B1) >> 3
+    const int16x8_t d0 = vaddq_s16(c1, a0);
+    const int16x8_t d1 = vaddq_s16(c0, a1);
+    const int16x8_t e0 = vrshrq_n_s16(d0, 1);  // rounding >> 1: value added to the even outputs
+    const int16x8_t e1 = vrshrq_n_s16(d1, 1);  // rounding >> 1: value added to the odd outputs
+    const int16x8x2_t f = vzipq_s16(e0, e1);   // interleave as e0[0], e1[0], e0[1], e1[1], ...
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);  // clamp to [0, MAX_Y]
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {  // left-over: scalar path for the last (fewer than 8) positions
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;  // includes the +8 rounding term for the >> 4 below
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;  // (9*A0 + 3*A1 + 3*B0 + B1 + 8) >> 4
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;  // (3*A0 + 9*A1 + B0 + 3*B1 + 8) >> 4
+    out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
+  }
+}
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSharpYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {  // points the WebPSharpYUV* function pointers at the NEON versions in this file
+  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
+  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
+  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
+WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/yuv_sse2.c b/thirdparty/libwebp/src/dsp/yuv_sse2.c
index e33c2bbafd..6810bf8d15 100644
--- a/thirdparty/libwebp/dsp/yuv_sse2.c
+++ b/thirdparty/libwebp/src/dsp/yuv_sse2.c
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 #if defined(WEBP_USE_SSE2)
 
-#include "./common_sse2.h"
+#include "src/dsp/common_sse2.h"
 #include <stdlib.h>
 #include <emmintrin.h>
 
@@ -26,12 +26,12 @@
 // R = (19077 * y             + 26149 * v - 14234) >> 6
 // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
 // B = (19077 * y + 33050 * u             - 17685) >> 6
-static void ConvertYUV444ToRGB(const __m128i* const Y0,
-                               const __m128i* const U0,
-                               const __m128i* const V0,
-                               __m128i* const R,
-                               __m128i* const G,
-                               __m128i* const B) {
+static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
+                                    const __m128i* const U0,
+                                    const __m128i* const V0,
+                                    __m128i* const R,
+                                    __m128i* const G,
+                                    __m128i* const B) {
   const __m128i k19077 = _mm_set1_epi16(19077);
   const __m128i k26149 = _mm_set1_epi16(26149);
   const __m128i k14234 = _mm_set1_epi16(14234);
@@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0,
 }
 
 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
-static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
+static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
   return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
 }
 
 // Load and replicate the U/V samples
-static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
+static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
@@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
 }
 
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB(const uint8_t* const y,
-                        const uint8_t* const u,
-                        const uint8_t* const v,
-                        __m128i* const R, __m128i* const G, __m128i* const B) {
-  const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
-  ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
+static void YUV444ToRGB_SSE2(const uint8_t* const y,
+                             const uint8_t* const u,
+                             const uint8_t* const v,
+                             __m128i* const R, __m128i* const G,
+                             __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
+                V0 = Load_HI_16_SSE2(v);
+  ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
 }
 
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB(const uint8_t* const y,
-                        const uint8_t* const u,
-                        const uint8_t* const v,
-                        __m128i* const R, __m128i* const G, __m128i* const B) {
-  const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
-  ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
+static void YUV420ToRGB_SSE2(const uint8_t* const y,
+                             const uint8_t* const u,
+                             const uint8_t* const v,
+                             __m128i* const R, __m128i* const G,
+                             __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
+                V0 = Load_UV_HI_8_SSE2(v);
+  ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
 }
 
 // Pack R/G/B/A results into 32b output.
-static WEBP_INLINE void PackAndStore4(const __m128i* const R,
-                                      const __m128i* const G,
-                                      const __m128i* const B,
-                                      const __m128i* const A,
-                                      uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
+                                           const __m128i* const G,
+                                           const __m128i* const B,
+                                           const __m128i* const A,
+                                           uint8_t* const dst) {
   const __m128i rb = _mm_packus_epi16(*R, *B);
   const __m128i ga = _mm_packus_epi16(*G, *A);
   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R,
 }
 
 // Pack R/G/B/A results into 16b output.
-static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
-                                         const __m128i* const G,
-                                         const __m128i* const B,
-                                         const __m128i* const A,
-                                         uint8_t* const dst) {
-#if !defined(WEBP_SWAP_16BIT_CSP)
+static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
+                                              const __m128i* const G,
+                                              const __m128i* const B,
+                                              const __m128i* const A,
+                                              uint8_t* const dst) {
+#if (WEBP_SWAP_16BIT_CSP == 0)
   const __m128i rg0 = _mm_packus_epi16(*R, *G);
   const __m128i ba0 = _mm_packus_epi16(*B, *A);
 #else
@@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
 }
 
 // Pack R/G/B results into 16b output.
-static WEBP_INLINE void PackAndStore565(const __m128i* const R,
-                                        const __m128i* const G,
-                                        const __m128i* const B,
-                                        uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
+                                             const __m128i* const G,
+                                             const __m128i* const B,
+                                             uint8_t* const dst) {
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
@@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
   const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
   const __m128i rg = _mm_or_si128(r1, g1);
   const __m128i gb = _mm_or_si128(g2, b1);
-#if !defined(WEBP_SWAP_16BIT_CSP)
+#if (WEBP_SWAP_16BIT_CSP == 0)
   const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
 #else
   const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
@@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
 // Pack the planar buffers
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
-                                    __m128i* const in2, __m128i* const in3,
-                                    __m128i* const in4, __m128i* const in5,
-                                    uint8_t* const rgb) {
+static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
+                                         __m128i* const in2, __m128i* const in3,
+                                         __m128i* const in4, __m128i* const in5,
+                                         uint8_t* const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
@@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
 
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4(&R, &G, &B, &kAlpha, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
   }
 }
 
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4(&B, &G, &R, &kAlpha, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
   }
 }
 
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4(&kAlpha, &R, &G, &B, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
   }
 }
 
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                             const uint8_t* v, uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4444(&R, &G, &B, &kAlpha, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
   }
 }
 
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                      uint8_t* dst) {
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst) {
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore565(&R, &G, &B, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore565_SSE2(&R, &G, &B, dst);
   }
 }
 
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
-  YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
-  YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
-  YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
-  YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+  YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
 
   // Cast to 8b and store as RRRRGGGGBBBB.
   rgb0 = _mm_packus_epi16(R0, R1);
@@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   rgb5 = _mm_packus_epi16(B2, B3);
 
   // Pack as RGBRGBRGBRGB.
-  PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+  PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
 
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
-  YUV444ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-  YUV444ToRGB(y +  8, u +  8, v +  8, &R1, &G1, &B1);
-  YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
-  YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
 
   // Cast to 8b and store as BBBBGGGGRRRR.
   bgr0 = _mm_packus_epi16(B0, B1);
@@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   bgr5= _mm_packus_epi16(R2, R3);
 
   // Pack as BGRBGRBGRBGR.
-  PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+  PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
 }
 
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
 
-static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst, int len) {
+static void YuvToRgbaRow_SSE2(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV420ToRGB(y, u, v, &R, &G, &B);
-    PackAndStore4(&R, &G, &B, &kAlpha, dst);
+    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
     y += 8;
     u += 4;
     v += 4;
@@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst, int len) {
+static void YuvToBgraRow_SSE2(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV420ToRGB(y, u, v, &R, &G, &B);
-    PackAndStore4(&B, &G, &R, &kAlpha, dst);
+    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
     y += 8;
     u += 4;
     v += 4;
@@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst, int len) {
+static void YuvToArgbRow_SSE2(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV420ToRGB(y, u, v, &R, &G, &B);
-    PackAndStore4(&kAlpha, &R, &G, &B, dst);
+    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
     y += 8;
     u += 4;
     v += 4;
@@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE2(const uint8_t* y,
+                             const uint8_t* u, const uint8_t* v,
+                             uint8_t* dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
-    YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-    YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
-    YUV420ToRGB(y + 16, u +  8, v +  8, &R2, &G2, &B2);
-    YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
 
     // Cast to 8b and store as RRRRGGGGBBBB.
     rgb0 = _mm_packus_epi16(R0, R1);
@@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
     rgb5 = _mm_packus_epi16(B2, B3);
 
     // Pack as RGBRGBRGBRGB.
-    PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+    PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 
     y += 32;
     u += 16;
@@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE2(const uint8_t* y,
+                             const uint8_t* u, const uint8_t* v,
+                             uint8_t* dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
-    YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-    YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
-    YUV420ToRGB(y + 16, u +  8, v +  8, &R2, &G2, &B2);
-    YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
 
     // Cast to 8b and store as BBBBGGGGRRRR.
     bgr0 = _mm_packus_epi16(B0, B1);
@@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
     bgr5 = _mm_packus_epi16(R2, R3);
 
     // Pack as BGRBGRBGRBGR.
-    PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+    PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
 
     y += 32;
     u += 16;
@@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 extern void WebPInitSamplersSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
-  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
-  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
-  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
-  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
-  WebPSamplers[MODE_ARGB] = YuvToArgbRow;
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE2;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE2;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
+  WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
 }
 
 //------------------------------------------------------------------------------
@@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
 
 // Function that inserts a value of the second half of the in buffer in between
 // every two char of the first half.
-static WEBP_INLINE void RGB24PackedToPlanarHelper(
+static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
     const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
   out[0] = _mm_unpacklo_epi8(in[0], in[3]);
   out[1] = _mm_unpackhi_epi8(in[0], in[3]);
@@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper(
 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
-static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
-                                            __m128i* const out /*out[6]*/) {
+static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
+    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
   __m128i tmp[6];
   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@@ -468,16 +477,16 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
   tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
   tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
 
-  RGB24PackedToPlanarHelper(tmp, out);
-  RGB24PackedToPlanarHelper(out, tmp);
-  RGB24PackedToPlanarHelper(tmp, out);
-  RGB24PackedToPlanarHelper(out, tmp);
-  RGB24PackedToPlanarHelper(tmp, out);
+  RGB24PackedToPlanarHelper_SSE2(tmp, out);
+  RGB24PackedToPlanarHelper_SSE2(out, tmp);
+  RGB24PackedToPlanarHelper_SSE2(tmp, out);
+  RGB24PackedToPlanarHelper_SSE2(out, tmp);
+  RGB24PackedToPlanarHelper_SSE2(tmp, out);
 }
 
 // Convert 8 packed ARGB to r[], g[], b[]
-static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
-                                            __m128i* const rgb /*in[6]*/) {
+static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
+                                                 __m128i* const rgb /*in[6]*/) {
   const __m128i zero = _mm_setzero_si128();
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a1 = LOAD_16(argb + 4);
@@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
 } while (0)
 
 #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
-static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
-                                      const __m128i* const G,
-                                      const __m128i* const B,
-                                      __m128i* const Y) {
+static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
+                                           const __m128i* const G,
+                                           const __m128i* const B,
+                                           __m128i* const Y) {
   const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
   const __m128i kGB_y = MK_CST_16(16384, 6420);
   const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
 }
 
-static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
-                                       const __m128i* const G,
-                                       const __m128i* const B,
-                                       __m128i* const U, __m128i* const V) {
+static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
+                                            const __m128i* const G,
+                                            const __m128i* const B,
+                                            __m128i* const U,
+                                            __m128i* const V) {
   const __m128i kRG_u = MK_CST_16(-9719, -19081);
   const __m128i kGB_u = MK_CST_16(0, 28800);
   const __m128i kRG_v = MK_CST_16(28800, 0);
@@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
 #undef MK_CST_16
 #undef TRANSFORM
 
-static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
     __m128i rgb_plane[6];
     int j;
 
-    RGB24PackedToPlanar(rgb, rgb_plane);
+    RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
 
     for (j = 0; j < 2; ++j, i += 16) {
       const __m128i zero = _mm_setzero_si128();
@@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
       r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
       g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
       b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y0);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
 
       // Convert to 16-bit Y.
       r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
       g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
       b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y1);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
 
       // Cast to 8-bit and store.
       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
   }
 }
 
-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
     __m128i bgr_plane[6];
     int j;
 
-    RGB24PackedToPlanar(bgr, bgr_plane);
+    RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
 
     for (j = 0; j < 2; ++j, i += 16) {
       const __m128i zero = _mm_setzero_si128();
@@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
       b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
       g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
       r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y0);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
 
       // Convert to 16-bit Y.
       b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
       g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
       r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y1);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
 
       // Cast to 8-bit and store.
       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
   }
 }
 
-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
   const int max_width = width & ~15;
   int i;
   for (i = 0; i < max_width; i += 16) {
     __m128i Y0, Y1, rgb[6];
-    RGB32PackedToPlanar(&argb[i], rgb);
-    ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
-    ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
+    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+    ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
+    ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
     STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
   }
   for (; i < width; ++i) {   // left-over
@@ -636,31 +646,33 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
 
 // Horizontal add (doubled) of two 16b values, result is 16b.
 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
-static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
-                              __m128i* const out) {
+static void HorizontalAddPack_SSE2(const __m128i* const A,
+                                   const __m128i* const B,
+                                   __m128i* const out) {
   const __m128i k2 = _mm_set1_epi16(2);
   const __m128i C = _mm_madd_epi16(*A, k2);
   const __m128i D = _mm_madd_epi16(*B, k2);
   *out = _mm_packs_epi32(C, D);
 }
 
-static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
-                            int src_width, int do_store) {
+static void ConvertARGBToUV_SSE2(const uint32_t* argb,
+                                 uint8_t* u, uint8_t* v,
+                                 int src_width, int do_store) {
   const int max_width = src_width & ~31;
   int i;
   for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
     __m128i rgb[6], U0, V0, U1, V1;
-    RGB32PackedToPlanar(&argb[i], rgb);
-    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
-    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
-    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
-    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
-
-    RGB32PackedToPlanar(&argb[i + 16], rgb);
-    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
-    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
-    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
-    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
+    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
 
     U0 = _mm_packus_epi16(U0, U1);
     V0 = _mm_packus_epi16(V0, V1);
@@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
 }
 
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
-static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
-                                                 __m128i* const r,
-                                                 __m128i* const g,
-                                                 __m128i* const b) {
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
+    const uint16_t* const rgbx,
+    __m128i* const r, __m128i* const g, __m128i* const b) {
   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
   const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
@@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
   *b = _mm_unpacklo_epi64(B1, B3);
 }
 
-static void ConvertRGBA32ToUV(const uint16_t* rgb,
-                              uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
+                                   uint8_t* u, uint8_t* v, int width) {
   const int max_width = width & ~15;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   while (rgb < last_rgb) {
     __m128i r, g, b, U0, V0, U1, V1;
-    RGBA32PackedToPlanar_16b(rgb +  0, &r, &g, &b);
-    ConvertRGBToUV(&r, &g, &b, &U0, &V0);
-    RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
-    ConvertRGBToUV(&r, &g, &b, &U1, &V1);
+    RGBA32PackedToPlanar_16b_SSE2(rgb +  0, &r, &g, &b);
+    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
+    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
+    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
     STORE_16(_mm_packus_epi16(U0, U1), u);
     STORE_16(_mm_packus_epi16(V0, V1), v);
     u += 16;
@@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb,
 extern void WebPInitConvertARGBToYUVSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
-  WebPConvertARGBToY = ConvertARGBToY;
-  WebPConvertARGBToUV = ConvertARGBToUV;
+  WebPConvertARGBToY = ConvertARGBToY_SSE2;
+  WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
 
-  WebPConvertRGB24ToY = ConvertRGB24ToY;
-  WebPConvertBGR24ToY = ConvertBGR24ToY;
+  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
 
-  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
 }
 
 //------------------------------------------------------------------------------
diff --git a/thirdparty/libwebp/enc/alpha_enc.c b/thirdparty/libwebp/src/enc/alpha_enc.c
index 5a2c931f92..7e8d87f22e 100644
--- a/thirdparty/libwebp/enc/alpha_enc.c
+++ b/thirdparty/libwebp/src/enc/alpha_enc.c
@@ -14,12 +14,12 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
-#include "../utils/filters_utils.h"
-#include "../utils/quant_levels_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/filters_utils.h"
+#include "src/utils/quant_levels_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
@@ -44,11 +44,11 @@
 //           invalid quality or method, or
 //           memory allocation for the compressed data fails.
 
-#include "../enc/vp8li_enc.h"
+#include "src/enc/vp8li_enc.h"
 
 static int EncodeLossless(const uint8_t* const data, int width, int height,
                           int effort_level,  // in [0..6] range
-                          VP8LBitWriter* const bw,
+                          int use_quality_100, VP8LBitWriter* const bw,
                           WebPAuxStats* const stats) {
   int ok = 0;
   WebPConfig config;
@@ -76,7 +76,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   // Set a low default quality for encoding alpha. Ensure that Alpha quality at
   // lower methods (3 and below) is less than the threshold for triggering
   // costly 'BackwardReferencesTraceBackwards'.
-  config.quality = 8.f * effort_level;
+  // If the alpha quality is set to 100 and the method to 6, allow for a high
+  // lossless quality to trigger the cruncher.
+  config.quality =
+      (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
   assert(config.quality >= 0 && config.quality <= 100.f);
 
   // TODO(urvang): Temporary fix to avoid generating images that trigger
@@ -134,7 +137,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   if (method != ALPHA_NO_COMPRESSION) {
     ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
     ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
-                              &tmp_bw, &result->stats);
+                              !reduce_levels, &tmp_bw, &result->stats);
     if (ok) {
       output = VP8LBitWriterFinish(&tmp_bw);
       output_size = VP8LBitWriterNumBytes(&tmp_bw);
@@ -264,6 +267,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
                              reduce_levels, effort_level, NULL, &best);
   }
   if (ok) {
+#if !defined(WEBP_DISABLE_STATS)
     if (stats != NULL) {
       stats->lossless_features = best.stats.lossless_features;
       stats->histogram_bits = best.stats.histogram_bits;
@@ -274,6 +278,9 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
       stats->lossless_hdr_size = best.stats.lossless_hdr_size;
       stats->lossless_data_size = best.stats.lossless_data_size;
     }
+#else
+    (void)stats;
+#endif
     *output_size = VP8BitWriterSize(&best.bw);
     *output = VP8BitWriterBuf(&best.bw);
   } else {
@@ -339,10 +346,12 @@ static int EncodeAlpha(VP8Encoder* const enc,
     ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
                                filter, reduce_levels, effort_level, output,
                                output_size, pic->stats);
+#if !defined(WEBP_DISABLE_STATS)
     if (pic->stats != NULL) {  // need stats?
       pic->stats->coded_size += (int)(*output_size);
       enc->sse_[3] = sse;
     }
+#endif
   }
 
   WebPSafeFree(quant_alpha);
diff --git a/thirdparty/libwebp/enc/analysis_enc.c b/thirdparty/libwebp/src/enc/analysis_enc.c
index dce159b316..08f471f5f8 100644
--- a/thirdparty/libwebp/enc/analysis_enc.c
+++ b/thirdparty/libwebp/src/enc/analysis_enc.c
@@ -15,9 +15,9 @@
 #include <string.h>
 #include <assert.h>
 
-#include "./vp8i_enc.h"
-#include "./cost_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
+#include "src/utils/utils.h"
 
 #define MAX_ITERS_K_MEANS  6
 
diff --git a/thirdparty/libwebp/src/enc/backward_references_cost_enc.c b/thirdparty/libwebp/src/enc/backward_references_cost_enc.c
new file mode 100644
index 0000000000..7175496c7f
--- /dev/null
+++ b/thirdparty/libwebp/src/enc/backward_references_cost_enc.c
@@ -0,0 +1,790 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Improves a given set of backward references by analyzing its bit cost.
+// The algorithm is similar to the Zopfli compression algorithm but tailored to
+// images.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+//
+
+#include <assert.h>
+
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+
+typedef struct {
+  double alpha_[VALUES_IN_BYTE];
+  double red_[VALUES_IN_BYTE];
+  double blue_[VALUES_IN_BYTE];
+  double distance_[NUM_DISTANCE_CODES];
+  double* literal_;
+} CostModel;
+
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const uint32_t population_counts[], double output[]) {
+  uint32_t sum = 0;
+  int nonzeros = 0;
+  int i;
+  for (i = 0; i < num_symbols; ++i) {
+    sum += population_counts[i];
+    if (population_counts[i] > 0) {
+      ++nonzeros;
+    }
+  }
+  if (nonzeros <= 1) {
+    memset(output, 0, num_symbols * sizeof(*output));
+  } else {
+    const double logsum = VP8LFastLog2(sum);
+    for (i = 0; i < num_symbols; ++i) {
+      output[i] = logsum - VP8LFastLog2(population_counts[i]);
+    }
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
+                          const VP8LBackwardRefs* const refs) {
+  int ok = 0;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+  if (histo == NULL) goto Error;
+
+  // The following code is similar to VP8LHistogramCreate but converts the
+  // distance to plane code.
+  VP8LHistogramInit(histo, cache_bits);
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
+                                    xsize);
+    VP8LRefsCursorNext(&c);
+  }
+
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(histo->palette_code_bits_),
+      histo->literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
+  ok = 1;
+
+ Error:
+  VP8LFreeHistogram(histo);
+  return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+  return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+                                        uint32_t length) {
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(length, &code, &extra_bits);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+                                          uint32_t distance) {
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+  return m->distance_[code] + extra_bits;
+}
+
+static WEBP_INLINE void AddSingleLiteralWithCostModel(
+    const uint32_t* const argb, VP8LColorCache* const hashers,
+    const CostModel* const cost_model, int idx, int use_color_cache,
+    float prev_cost, float* const cost, uint16_t* const dist_array) {
+  double cost_val = prev_cost;
+  const uint32_t color = argb[idx];
+  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+  if (ix >= 0) {
+    // use_color_cache is true and hashers contains color
+    const double mul0 = 0.68;
+    cost_val += GetCacheCost(cost_model, ix) * mul0;
+  } else {
+    const double mul1 = 0.82;
+    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+    cost_val += GetLiteralCost(cost_model, color) * mul1;
+  }
+  if (cost[idx] > cost_val) {
+    cost[idx] = (float)cost_val;
+    dist_array[idx] = 1;  // only one is inserted.
+  }
+}
+
+// -----------------------------------------------------------------------------
+// CostManager and interval handling
+
+// Empirical value to avoid high memory consumption but good for performance.
+#define COST_CACHE_INTERVAL_SIZE_MAX 500
+
+// To perform backward reference every pixel at index index_ is considered and
+// the cost for the MAX_LENGTH following pixels computed. Those following pixels
+// at index index_ + k (0 <= k < MAX_LENGTH) have a cost of:
+//     cost_ = distance cost at index + GetLengthCost(cost_model, k)
+// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
+// array of size MAX_LENGTH.
+// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
+// minimal values using intervals of constant cost.
+// An interval is defined by the index_ of the pixel that generated it and
+// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
+// it contains the minimum value for pixels between start_ and end_.
+// Intervals are stored in a linked list and ordered by start_. When a new
+// interval has a better value, old intervals are split or removed. There are
+// therefore no overlapping intervals.
+typedef struct CostInterval CostInterval;
+struct CostInterval {
+  float cost_;
+  int start_;
+  int end_;
+  int index_;
+  CostInterval* previous_;
+  CostInterval* next_;
+};
+
+// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
+typedef struct {
+  double cost_;
+  int start_;
+  int end_;       // Exclusive.
+} CostCacheInterval;
+
+// This structure is in charge of managing intervals and costs.
+// It caches the different CostCacheInterval, caches the different
+// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
+// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
+#define COST_MANAGER_MAX_FREE_LIST 10
+typedef struct {
+  CostInterval* head_;
+  int count_;  // The number of stored intervals.
+  CostCacheInterval* cache_intervals_;
+  size_t cache_intervals_size_;
+  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float* costs_;
+  uint16_t* dist_array_;
+  // Most of the time, we only need few intervals -> use a free-list, to avoid
+  // fragmentation with small allocs in most common cases.
+  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
+  CostInterval* free_intervals_;
+  // These are regularly malloc'd remains. This list can't grow larger than
+  // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
+  CostInterval* recycled_intervals_;
+} CostManager;
+
+static void CostIntervalAddToFreeList(CostManager* const manager,
+                                      CostInterval* const interval) {
+  interval->next_ = manager->free_intervals_;
+  manager->free_intervals_ = interval;
+}
+
+static int CostIntervalIsInFreeList(const CostManager* const manager,
+                                    const CostInterval* const interval) {
+  return (interval >= &manager->intervals_[0] &&
+          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
+}
+
+static void CostManagerInitFreeList(CostManager* const manager) {
+  int i;
+  manager->free_intervals_ = NULL;
+  for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
+    CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
+  }
+}
+
+static void DeleteIntervalList(CostManager* const manager,
+                               const CostInterval* interval) {
+  while (interval != NULL) {
+    const CostInterval* const next = interval->next_;
+    if (!CostIntervalIsInFreeList(manager, interval)) {
+      WebPSafeFree((void*)interval);
+    }  // else: do nothing
+    interval = next;
+  }
+}
+
+static void CostManagerClear(CostManager* const manager) {
+  if (manager == NULL) return;
+
+  WebPSafeFree(manager->costs_);
+  WebPSafeFree(manager->cache_intervals_);
+
+  // Clear the interval lists.
+  DeleteIntervalList(manager, manager->head_);
+  manager->head_ = NULL;
+  DeleteIntervalList(manager, manager->recycled_intervals_);
+  manager->recycled_intervals_ = NULL;
+
+  // Reset pointers, count_ and cache_intervals_size_.
+  memset(manager, 0, sizeof(*manager));
+  CostManagerInitFreeList(manager);
+}
+
+static int CostManagerInit(CostManager* const manager,
+                           uint16_t* const dist_array, int pix_count,
+                           const CostModel* const cost_model) {
+  int i;
+  const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
+  // Sets up 'manager' for 'pix_count' pixels; returns 0 if an allocation fails.
+  manager->costs_ = NULL;  // NULL first: failure paths call CostManagerClear()
+  manager->cache_intervals_ = NULL;
+  manager->head_ = NULL;
+  manager->recycled_intervals_ = NULL;
+  manager->count_ = 0;
+  manager->dist_array_ = dist_array;
+  CostManagerInitFreeList(manager);
+
+  // Fill in the cost_cache_ and count the runs of identical length costs.
+  manager->cache_intervals_size_ = 1;
+  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
+  for (i = 1; i < cost_cache_size; ++i) {
+    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+    // A cost that differs from the previous one starts a new interval.
+    if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
+      ++manager->cache_intervals_size_;
+    }
+  }
+
+  // With the current cost model, we usually have below 20 intervals.
+  // The worst case scenario with a cost model would be if every length has a
+  // different cost, hence MAX_LENGTH but that is impossible with the current
+  // implementation that spirals around a pixel.
+  assert(manager->cache_intervals_size_ <= MAX_LENGTH);
+  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
+      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
+  if (manager->cache_intervals_ == NULL) {
+    CostManagerClear(manager);
+    return 0;
+  }
+
+  // Fill in the cache_intervals_: one [start_, end_) run per distinct cost.
+  {
+    CostCacheInterval* cur = manager->cache_intervals_;
+
+    // Consecutive values in cost_cache_ are compared and whenever the cost
+    // changes, a new interval is created and bounded.
+    cur->start_ = 0;
+    cur->end_ = 1;
+    cur->cost_ = manager->cost_cache_[0];
+    for (i = 1; i < cost_cache_size; ++i) {
+      const double cost_val = manager->cost_cache_[i];
+      if (cost_val != cur->cost_) {
+        ++cur;
+        // Initialize an interval.
+        cur->start_ = i;
+        cur->cost_ = cost_val;
+      }
+      cur->end_ = i + 1;  // end_ is exclusive and grows with the run
+    }
+  }
+
+  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+  if (manager->costs_ == NULL) {
+    CostManagerClear(manager);
+    return 0;
+  }
+  // Set the initial costs_ high for every pixel as we will keep the minimum.
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+
+  return 1;
+}
+
+// Records 'cost' for pixel 'i' when it beats the value stored so far; the
+// length implied by 'position' (i - position + 1) then goes to dist_array_.
+static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
+                                   int position, float cost) {
+  const int offset_in_interval = i - position;
+  assert(offset_in_interval >= 0 && offset_in_interval < MAX_LENGTH);
+
+  if (cost < manager->costs_[i]) {
+    manager->costs_[i] = cost;
+    manager->dist_array_[i] = offset_in_interval + 1;
+  }
+}
+
+// Applies UpdateCost() with the same 'position' and 'cost' to every pixel of
+// the half-open range ['start', 'end').
+static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
+                                              int start, int end, int position,
+                                              float cost) {
+  int pixel = start;
+  while (pixel < end) UpdateCost(manager, pixel++, position, cost);
+}
+
+// Links 'prev' -> 'next' in 'manager'; a NULL 'prev' makes 'next' the head.
+static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
+                                         CostInterval* const prev,
+                                         CostInterval* const next) {
+  if (next != NULL) next->previous_ = prev;
+
+  if (prev == NULL) {
+    manager->head_ = next;
+  } else {
+    prev->next_ = next;
+  }
+}
+
+// Unlinks 'interval' from the active list and keeps it around for reuse.
+static WEBP_INLINE void PopInterval(CostManager* const manager,
+                                    CostInterval* const interval) {
+  if (interval == NULL) return;
+
+  ConnectIntervals(manager, interval->previous_, interval->next_);
+  if (!CostIntervalIsInFreeList(manager, interval)) {  // malloc'd: recycle it
+    interval->next_ = manager->recycled_intervals_;
+    manager->recycled_intervals_ = interval;
+  } else {
+    CostIntervalAddToFreeList(manager, interval);
+  }
+  manager->count_ -= 1;
+  assert(manager->count_ >= 0);
+}
+
+// Updates the cost at pixel 'i' from every stored interval that overlaps it.
+// The list is walked from the head for as long as intervals start at or
+// before 'i'.
+// If 'do_clean_intervals' is non-zero, the intervals met on the way that end
+// at or before 'i' are popped.
+static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
+                                          int do_clean_intervals) {
+  CostInterval* it;
+  CostInterval* following;
+
+  for (it = manager->head_; it != NULL && it->start_ <= i; it = following) {
+    // Saved before any pop: PopInterval() relinks 'it'.
+    following = it->next_;
+    if (it->end_ > i) {
+      UpdateCost(manager, i, it->index_, it->cost_);
+    } else if (do_clean_intervals) {
+      // We have an outdated interval, remove it.
+      PopInterval(manager, it);
+    }
+  }
+}
+
+// Links the orphan interval 'current' into the list of 'manager', which is
+// kept sorted by start_. 'previous' is only a hint for where to start the
+// search (it can be NULL, in which case the search starts from the head).
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
+                                               CostInterval* const current,
+                                               CostInterval* previous) {
+  assert(current != NULL);
+
+  if (previous == NULL) previous = manager->head_;
+  while (previous != NULL && current->start_ < previous->start_) {
+    previous = previous->previous_;  // hint starts too late: step backwards
+  }
+  while (previous != NULL && previous->next_ != NULL &&
+         previous->next_->start_ < current->start_) {
+    previous = previous->next_;  // successor still starts earlier: step on
+  }
+  // A NULL 'previous' at this point means 'current' becomes the new head.
+  if (previous != NULL) {
+    ConnectIntervals(manager, current, previous->next_);
+  } else {
+    ConnectIntervals(manager, current, manager->head_);
+  }
+  ConnectIntervals(manager, previous, current);
+}
+
+// Inserts the interval [start, end) with the given 'cost' and 'position' in
+// the start_-sorted list of 'manager', using interval_in as a search hint.
+static WEBP_INLINE void InsertInterval(CostManager* const manager,
+                                       CostInterval* const interval_in,
+                                       float cost, int position, int start,
+                                       int end) {
+  CostInterval* interval_new;
+
+  if (start >= end) return;  // empty interval: nothing to insert
+  if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
+    // Too many intervals are stored: apply the cost to the pixels right away.
+    UpdateCostPerInterval(manager, start, end, position, cost);
+    return;
+  }
+  if (manager->free_intervals_ != NULL) {  // first choice: the free list
+    interval_new = manager->free_intervals_;
+    manager->free_intervals_ = interval_new->next_;
+  } else if (manager->recycled_intervals_ != NULL) {  // then a recycled one
+    interval_new = manager->recycled_intervals_;
+    manager->recycled_intervals_ = interval_new->next_;
+  } else {  // malloc for good
+    interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
+    if (interval_new == NULL) {
+      // Allocation failed: apply the cost to the pixels right away instead.
+      UpdateCostPerInterval(manager, start, end, position, cost);
+      return;
+    }
+  }
+
+  interval_new->cost_ = cost;
+  interval_new->index_ = position;
+  interval_new->start_ = start;
+  interval_new->end_ = end;
+  PositionOrphanInterval(manager, interval_new, interval_in);
+
+  ++manager->count_;
+}
+
+// Given a new cost interval defined by its start at position, its length value
+// and distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes too heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager,
+                                     double distance_cost, int position,
+                                     int len) {
+  size_t i;
+  CostInterval* interval = manager->head_;
+  CostInterval* interval_next;
+  const CostCacheInterval* const cost_cache_intervals =
+      manager->cache_intervals_;
+  // If the interval is small enough, no need to deal with the heavy
+  // interval logic, just serialize it right away. This constant is empirical.
+  const int kSkipDistance = 10;
+
+  if (len < kSkipDistance) {  // short copy: write the costs directly
+    int j;
+    for (j = position; j < position + len; ++j) {
+      const int k = j - position;
+      float cost_tmp;
+      assert(k >= 0 && k < MAX_LENGTH);
+      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+
+      if (manager->costs_[j] > cost_tmp) {
+        manager->costs_[j] = cost_tmp;
+        manager->dist_array_[j] = k + 1;
+      }
+    }
+    return;
+  }
+
+  for (i = 0; i < manager->cache_intervals_size_ &&
+              cost_cache_intervals[i].start_ < len;
+       ++i) {
+    // Define the intersection of the ith interval with the new one.
+    int start = position + cost_cache_intervals[i].start_;
+    const int end = position + (cost_cache_intervals[i].end_ > len
+                                 ? len
+                                 : cost_cache_intervals[i].end_);
+    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+
+    for (; interval != NULL && interval->start_ < end;
+         interval = interval_next) {  // stored intervals starting before 'end'
+      interval_next = interval->next_;
+
+      // Make sure we have some overlap.
+      if (start >= interval->end_) continue;
+
+      if (cost >= interval->cost_) {
+        // When intervals are represented, the lower, the better.
+        // [**********************************************************[
+        // start                                                    end
+        //                   [----------------------------------[
+        //                   interval->start_       interval->end_
+        // If we are worse than what we already have, add whatever we have so
+        // far up to interval.
+        const int start_new = interval->end_;
+        InsertInterval(manager, interval, cost, position, start,
+                       interval->start_);
+        start = start_new;
+        if (start >= end) break;
+        continue;
+      }
+
+      if (start <= interval->start_) {
+        if (interval->end_ <= end) {
+          //                   [----------------------------------[
+          //                   interval->start_       interval->end_
+          // [**************************************************************[
+          // start                                                        end
+          // We can safely remove the old interval as it is fully included.
+          PopInterval(manager, interval);
+        } else {
+          //              [------------------------------------[
+          //              interval->start_        interval->end_
+          // [*****************************[
+          // start                       end
+          interval->start_ = end;  // keep only the part of the old one past 'end'
+          break;
+        }
+      } else {
+        if (end < interval->end_) {
+          // [--------------------------------------------------------------[
+          // interval->start_                                  interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          // We have to split the old interval as it fully contains the new one.
+          const int end_original = interval->end_;
+          interval->end_ = start;
+          InsertInterval(manager, interval, interval->cost_, interval->index_,
+                         end, end_original);
+          interval = interval->next_;
+          break;
+        } else {
+          // [------------------------------------[
+          // interval->start_        interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          interval->end_ = start;  // keep only the part of the old one before 'start'
+        }
+      }
+    }
+    // Insert the remaining interval from start to end.
+    InsertInterval(manager, interval, cost, position, start, end);
+  }
+}
+
+// Estimates the cheapest coding of the image with a cost model built from
+// 'refs' and records the chosen step lengths in 'dist_array' (one entry per
+// pixel). Returns 1 on success, 0 on failure (allocation or refs->error_).
+static int BackwardReferencesHashChainDistanceOnly(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
+    uint16_t* const dist_array) {
+  int i;
+  int ok = 0;
+  int cc_init = 0;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  const size_t literal_array_size =
+      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+  CostModel* const cost_model =
+      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
+  VP8LColorCache hashers;
+  // Zero-initialized: CostManagerClear() runs at 'Error' even when
+  // CostManagerInit() was never reached, and must not free garbage pointers.
+  CostManager* cost_manager =
+      (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
+  int offset_prev = -1, len_prev = -1;
+  double offset_cost = -1;
+  int first_offset_is_constant = -1;  // initialized with 'impossible' value
+  int reach = 0;
+
+  if (cost_model == NULL || cost_manager == NULL) goto Error;
+
+  cost_model->literal_ = (double*)(cost_model + 1);
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+  if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) goto Error;
+  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
+    goto Error;
+  }
+
+  // We loop one pixel at a time, but store all currently best points to
+  // non-processed locations from this point.
+  dist_array[0] = 0;
+  // Add first pixel as literal.
+  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
+                                0.f, cost_manager->costs_, dist_array);
+
+  for (i = 1; i < pix_count; ++i) {
+    const float prev_cost = cost_manager->costs_[i - 1];
+    int offset, len;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+
+    // Try adding the pixel as a literal.
+    AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
+                                  use_color_cache, prev_cost,
+                                  cost_manager->costs_, dist_array);
+
+    // If we are dealing with a non-literal.
+    if (len >= 2) {
+      if (offset != offset_prev) {
+        const int code = VP8LDistanceToPlaneCode(xsize, offset);
+        offset_cost = GetDistanceCost(cost_model, code);
+        first_offset_is_constant = 1;
+        PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+      } else {
+        assert(offset_cost >= 0);
+        assert(len_prev >= 0);
+        assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
+        // Instead of considering all contributions from a pixel i by calling:
+        //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+        // we optimize these contributions in case offset_cost stays the same
+        // for consecutive pixels. This describes a set of pixels similar to a
+        // previous set (e.g. constant color regions).
+        if (first_offset_is_constant) {
+          reach = i - 1 + len_prev - 1;
+          first_offset_is_constant = 0;
+        }
+
+        if (i + len - 1 > reach) {
+          // We can only go further with the same offset if the previous
+          // length was maxed, hence len_prev == len == MAX_LENGTH.
+          // TODO(vrabaud), bump i to the end right away (insert cache and
+          // update cost).
+          // TODO(vrabaud), check if one of the points in between does not have
+          // a lower cost.
+          // Already consider the pixel at "reach" to add intervals that are
+          // better than whatever we add.
+          int j, offset_j, len_j = 0;
+          assert(len == MAX_LENGTH || len == pix_count - i);
+          // Figure out the last consecutive pixel within [i, reach + 1] with
+          // the same offset.
+          for (j = i; j <= reach; ++j) {
+            VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
+            if (offset_j != offset) {
+              VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
+              break;
+            }
+          }
+          // Update the cost at j - 1 and j.
+          UpdateCostAtIndex(cost_manager, j - 1, 0);
+          UpdateCostAtIndex(cost_manager, j, 0);
+
+          PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
+                       j, len_j);
+          reach = j + len_j - 1;
+        }
+      }
+    }
+
+    UpdateCostAtIndex(cost_manager, i, 1);
+    offset_prev = offset;
+    len_prev = len;
+  }
+
+  ok = !refs->error_;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  CostManagerClear(cost_manager);
+  WebPSafeFree(cost_model);
+  WebPSafeFree(cost_manager);
+  return ok;
+}
+
+// Packs the chosen path at the end of 'dist_array' and returns a pointer to
+// that part of the array together with its length. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint16_t* const dist_array,
+                           int dist_array_size,
+                           uint16_t** const chosen_path,
+                           int* const chosen_path_size) {
+  // Walk with indices: a pointer stepped before dist_array[0] would be UB.
+  int write_pos = dist_array_size;     // first slot of the packed path
+  int read_pos = dist_array_size - 1;  // pixel whose step is read next
+  while (read_pos >= 0) {
+    const int step = dist_array[read_pos];
+    dist_array[--write_pos] = (uint16_t)step;
+    read_pos -= step;
+  }
+  *chosen_path = dist_array + write_pos;
+  *chosen_path_size = dist_array_size - write_pos;
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+    const uint32_t* const argb, int cache_bits,
+    const uint16_t* const chosen_path, int chosen_path_size,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
+  // Replays 'chosen_path' into 'refs': a length of 1 emits a literal or a
+  // color-cache index, any other length a copy. Returns !refs->error_.
+  const int use_color_cache = (cache_bits > 0);
+  VP8LColorCache hashers;
+  int cc_init = 0;
+  int ok = 0;
+  int pix = 0;  // index in 'argb' of the first pixel of the current step
+  int step;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  VP8LClearBackwardRefs(refs);
+  for (step = 0; step < chosen_path_size; ++step) {
+    const int len = chosen_path[step];
+    if (len == 1) {
+      PixOrCopy symbol;
+      const int cache_idx =
+          use_color_cache ? VP8LColorCacheContains(&hashers, argb[pix]) : -1;
+      if (cache_idx < 0) {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[pix]);
+        symbol = PixOrCopyCreateLiteral(argb[pix]);
+      } else {  // the cache already holds argb[pix]: emit its index
+        symbol = PixOrCopyCreateCacheIdx(cache_idx);
+      }
+      VP8LBackwardRefsCursorAdd(refs, symbol);
+      ++pix;
+    } else {
+      int k;
+      const int offset = VP8LHashChainFindOffset(hash_chain, pix);
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[pix + k]);
+        }
+      }
+      pix += len;
+    }
+  }
+  ok = !refs->error_;
+ Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return ok;
+}
+
+// Fills 'refs_dst' by following the path chosen with a cost model built from
+// 'refs_src'. Returns 1 on success, 0 otherwise.
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
+                                         const uint32_t* const argb,
+                                         int cache_bits,
+                                         const VP8LHashChain* const hash_chain,
+                                         const VP8LBackwardRefs* const refs_src,
+                                         VP8LBackwardRefs* const refs_dst) {
+  const int num_pixels = xsize * ysize;
+  uint16_t* const dist_array =
+      (uint16_t*)WebPSafeMalloc(num_pixels, sizeof(*dist_array));
+  uint16_t* chosen_path = NULL;
+  int chosen_path_size = 0;
+  int ok = 0;
+
+  // The first pass fills 'dist_array'; the path is then packed at its tail
+  // and replayed into 'refs_dst'. Any failure falls through to the cleanup.
+  if (dist_array != NULL &&
+      BackwardReferencesHashChainDistanceOnly(xsize, ysize, argb, cache_bits,
+                                              hash_chain, refs_src,
+                                              dist_array)) {
+    TraceBackwards(dist_array, num_pixels, &chosen_path, &chosen_path_size);
+    if (BackwardReferencesHashChainFollowChosenPath(
+            argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
+            refs_dst)) {
+      ok = 1;
+    }
+  }
+  WebPSafeFree(dist_array);
+  return ok;
+}
diff --git a/thirdparty/libwebp/src/enc/backward_references_enc.c b/thirdparty/libwebp/src/enc/backward_references_enc.c
new file mode 100644
index 0000000000..39230188b9
--- /dev/null
+++ b/thirdparty/libwebp/src/enc/backward_references_enc.c
@@ -0,0 +1,943 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <math.h>
+
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
+
+#define MIN_BLOCK_SIZE 256  // lower bound applied by VP8LBackwardRefsInit()
+
+#define MAX_ENTROPY    (1e30f)  // NOTE(review): looks like a sentinel; confirm
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
+
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {  // 8 rows of 16 codes each
+ 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
+ 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
+ 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
+ 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
+ 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
+ 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
+ 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
+ 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
+};  // read (and incremented by 1) in VP8LDistanceToPlaneCode()
+
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+int VP8LDistanceToPlaneCode(int xsize, int dist) {
+  // Nearby 2-D offsets use the LUT code (+1); others are shifted by 120.
+  const int dy = dist / xsize;
+  const int dx = dist - dy * xsize;
+  const int near = (dx <= 8 && dy < 8);
+  const int wrap = !near && (dx > xsize - 8 && dy < 7);
+  if (near) return plane_to_code_lut[dy * 16 + 8 - dx] + 1;
+  if (wrap) return plane_to_code_lut[(dy + 1) * 16 + 8 + (xsize - dx)] + 1;
+  return dist + 120;
+}
+
+// Returns what VP8LVectorMismatch() reports for array1 and array2 over at
+// most max_limit elements (expected: the index of their first difference).
+// Shortcut: when the arrays already differ at index best_len_match, the match
+// cannot beat the current best, so 0 is returned without the full scan (any
+// value strictly inferior to best_len_match would do for the caller).
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+                                       const uint32_t* const array2,
+                                       int best_len_match, int max_limit) {
+  // Cheap single-element probe at the current best length index, done
+  // before the 'expensive' linear match.
+  const int may_improve = (array1[best_len_match] == array2[best_len_match]);
+
+  return may_improve ? VP8LVectorMismatch(array1, array2, max_limit) : 0;
+}
+
+// -----------------------------------------------------------------------------
+//  VP8LBackwardRefs
+
+struct PixOrCopyBlock {  // one node of the singly linked list of blocks
+  PixOrCopyBlock* next_;   // next block (or NULL)
+  PixOrCopy* start_;       // data start (set right after this header)
+  int size_;               // number of entries of start_[] currently in use
+};
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {  // frees nothing
+  assert(refs != NULL);
+  if (refs->tail_ != NULL) {
+    *refs->tail_ = refs->free_blocks_;  // recycle all blocks at once
+  }
+  refs->free_blocks_ = refs->refs_;  // the used blocks now head the free list
+  refs->tail_ = &refs->refs_;        // empty list: next block lands in refs_
+  refs->last_block_ = NULL;
+  refs->refs_ = NULL;
+}
+
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+  PixOrCopyBlock* block;
+  assert(refs != NULL);
+  VP8LClearBackwardRefs(refs);  // moves every block to the free list
+  while ((block = refs->free_blocks_) != NULL) {
+    refs->free_blocks_ = block->next_;  // unlink first, then release
+    WebPSafeFree(block);
+  }
+}
+
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
+  assert(refs != NULL);
+  memset(refs, 0, sizeof(*refs));
+  refs->tail_ = &refs->refs_;  // empty list: the tail is the head slot
+  refs->block_size_ = block_size;
+  if (block_size < MIN_BLOCK_SIZE) refs->block_size_ = MIN_BLOCK_SIZE;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+  // Cursor on the first element of the first block, or with NULL positions
+  // when 'refs' holds no block.
+  PixOrCopyBlock* const first = refs->refs_;
+  VP8LRefsCursor cursor;
+
+  cursor.cur_block_ = first;
+  cursor.cur_pos = (first != NULL) ? first->start_ : NULL;
+  cursor.last_pos_ = (first != NULL) ? first->start_ + first->size_ : NULL;
+
+  return cursor;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
+  PixOrCopyBlock* const nb = c->cur_block_->next_;  // NULL after last block
+  c->cur_block_ = nb;
+  c->cur_pos = (nb != NULL) ? nb->start_ : NULL;
+  c->last_pos_ = (nb != NULL) ? nb->start_ + nb->size_ : NULL;
+}
+
+// Appends an empty block to 'refs', recycled from the free list when one is
+// available and allocated otherwise. Returns NULL (error_ set) if that fails.
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+  PixOrCopyBlock* blk = refs->free_blocks_;
+  if (blk != NULL) {  // recycle from free-list
+    refs->free_blocks_ = blk->next_;
+  } else {  // allocate header and data as one new memory chunk
+    const size_t sz = sizeof(*blk) + refs->block_size_ * sizeof(*blk->start_);
+    blk = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, sz);
+    if (blk == NULL) {
+      refs->error_ |= 1;
+      return NULL;
+    }
+    blk->start_ = (PixOrCopy*)((uint8_t*)blk + sizeof(*blk));  // not always aligned
+  }
+  *refs->tail_ = blk;
+  refs->tail_ = &blk->next_;
+  refs->last_block_ = blk;
+  blk->next_ = NULL;
+  blk->size_ = 0;
+  return blk;
+}
+
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                               const PixOrCopy v) {
+  // Appends 'v' to the last block, starting a new block when there is none
+  // or it is full. If no block can be obtained, 'v' is dropped.
+  PixOrCopyBlock* blk = refs->last_block_;
+  const int need_new = (blk == NULL || blk->size_ == refs->block_size_);
+  if (need_new) blk = BackwardRefsNewBlock(refs);  // sets refs->error_ on NULL
+  if (blk != NULL) blk->start_[blk->size_++] = v;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+  // Allocates 'size' entries for an empty chain. Returns 0 on failure.
+  uint32_t* storage;
+  assert(p->size_ == 0);
+  assert(p->offset_length_ == NULL);
+  assert(size > 0);
+  storage = (uint32_t*)WebPSafeMalloc(size, sizeof(*storage));
+  p->offset_length_ = storage;
+  if (storage != NULL) p->size_ = size;
+  return (storage != NULL);
+}
+
+void VP8LHashChainClear(VP8LHashChain* const p) {
+  assert(p != NULL);
+  // Release the storage and go back to the empty state (NULL, size 0).
+  WebPSafeFree(p->offset_length_);
+  p->offset_length_ = NULL;
+  p->size_ = 0;
+}
+
+// -----------------------------------------------------------------------------
+
+#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
+#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
+
+static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
+  // Hash of the pixel pair argb[0..1], reduced to its top HASH_BITS bits.
+  const uint32_t hi = (uint32_t)((argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu);
+  const uint32_t lo = (uint32_t)((argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu);
+  const uint32_t sum = hi + lo;  // wraps modulo 2^32
+  return sum >> (32 - HASH_BITS);
+}
+
+// Number of hash chain lookups allowed for a compression quality: 8 at
+// quality 0, growing with the square of quality up to 86 at quality 100.
+static int GetMaxItersForQuality(int quality) {
+  return (quality * quality) / 128 + 8;
+}
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+  // 16, 64 or 256 rows depending on quality, never more than WINDOW_SIZE.
+  const int shift = (quality > 50) ? 8 : (quality > 25) ? 6 : 4;
+  int window = WINDOW_SIZE;
+  assert(xsize > 0);
+  if (quality <= 75 && (xsize << shift) < WINDOW_SIZE) window = xsize << shift;
+  return window;
+}
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {
+  return (len >= MAX_LENGTH) ? MAX_LENGTH : len;  // clamp to MAX_LENGTH
+}
+
+int VP8LHashChainFill(VP8LHashChain* const p, int quality,
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort) {
+  const int size = xsize * ysize;
+  const int iter_max = GetMaxItersForQuality(quality);
+  const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+  int pos;
+  int argb_comp;
+  uint32_t base_position;
+  int32_t* hash_to_first_index;
+  // Temporarily use the p->offset_length_ as a hash chain.
+  int32_t* chain = (int32_t*)p->offset_length_;
+  assert(size > 0);
+  assert(p->size_ != 0);
+  assert(p->offset_length_ != NULL);
+
+  if (size <= 2) {
+    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+    return 1;
+  }
+
+  hash_to_first_index =
+      (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
+  if (hash_to_first_index == NULL) return 0;
+
+  // Set the int32_t array to -1.
+  memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
+  // Fill the chain linking pixels with the same hash.
+  argb_comp = (argb[0] == argb[1]);
+  for (pos = 0; pos < size - 2;) {
+    uint32_t hash_code;
+    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+    if (argb_comp && argb_comp_next) {
+      // Consecutive pixels with the same color will share the same hash.
+      // We therefore use a different hash: the color and its repetition
+      // length.
+      uint32_t tmp[2];
+      uint32_t len = 1;
+      tmp[0] = argb[pos];
+      // Figure out how far the pixels are the same.
+      // The last pixel has a different 64 bit hash, as its next pixel does
+      // not have the same color, so we just need to get to the last pixel equal
+      // to its follower.
+      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+        ++len;
+      }
+      if (len > MAX_LENGTH) {
+        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+        // because they are linked to their predecessor and we automatically
+        // check that in the main for loop below. Skipping means setting no
+        // predecessor in the chain, hence -1.
+        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+        pos += len - MAX_LENGTH;
+        len = MAX_LENGTH;
+      }
+      // Process the rest of the hash chain.
+      while (len) {
+        tmp[1] = len--;
+        hash_code = GetPixPairHash64(tmp);
+        chain[pos] = hash_to_first_index[hash_code];
+        hash_to_first_index[hash_code] = pos++;
+      }
+      argb_comp = 0;
+    } else {
+      // Just move one pixel forward.
+      hash_code = GetPixPairHash64(argb + pos);
+      chain[pos] = hash_to_first_index[hash_code];
+      hash_to_first_index[hash_code] = pos++;
+      argb_comp = argb_comp_next;
+    }
+  }
+  // Process the penultimate pixel.
+  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
+  WebPSafeFree(hash_to_first_index);
+
+  // Find the best match interval at each pixel, defined by an offset to the
+  // pixel and a length. The right-most pixel cannot match anything to the right
+  // (hence a best length of 0) and the left-most pixel nothing to the left
+  // (hence an offset of 0).
+  assert(size > 2);
+  p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+  for (base_position = size - 2; base_position > 0;) {
+    const int max_len = MaxFindCopyLength(size - 1 - base_position);
+    const uint32_t* const argb_start = argb + base_position;
+    int iter = iter_max;
+    int best_length = 0;
+    uint32_t best_distance = 0;
+    uint32_t best_argb;
+    const int min_pos =
+        (base_position > window_size) ? base_position - window_size : 0;
+    const int length_max = (max_len < 256) ? max_len : 256;
+    uint32_t max_base_position;
+
+    pos = chain[base_position];
+    if (!low_effort) {
+      int curr_length;
+      // Heuristic: use the comparison with the above line as an initialization.
+      if (base_position >= (uint32_t)xsize) {
+        curr_length = FindMatchLength(argb_start - xsize, argb_start,
+                                      best_length, max_len);
+        if (curr_length > best_length) {
+          best_length = curr_length;
+          best_distance = xsize;
+        }
+        --iter;
+      }
+      // Heuristic: compare to the previous pixel.
+      curr_length =
+          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+      if (curr_length > best_length) {
+        best_length = curr_length;
+        best_distance = 1;
+      }
+      --iter;
+      // Skip the for loop if we already have the maximum.
+      if (best_length == MAX_LENGTH) pos = min_pos - 1;
+    }
+    best_argb = argb_start[best_length];
+
+    for (; pos >= min_pos && --iter; pos = chain[pos]) {
+      int curr_length;
+      assert(base_position > (uint32_t)pos);
+
+      if (argb[pos + best_length] != best_argb) continue;
+
+      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
+      if (best_length < curr_length) {
+        best_length = curr_length;
+        best_distance = base_position - pos;
+        best_argb = argb_start[best_length];
+        // Stop if we have reached a good enough length.
+        if (best_length >= length_max) break;
+      }
+    }
+    // We have the best match but in case the two intervals continue matching
+    // to the left, we have the best matches for the left-extended pixels.
+    max_base_position = base_position;
+    while (1) {
+      assert(best_length <= MAX_LENGTH);
+      assert(best_distance <= WINDOW_SIZE);
+      p->offset_length_[base_position] =
+          (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
+      --base_position;
+      // Stop if we don't have a match or if we are out of bounds.
+      if (best_distance == 0 || base_position == 0) break;
+      // Stop if we cannot extend the matching intervals to the left.
+      if (base_position < best_distance ||
+          argb[base_position - best_distance] != argb[base_position]) {
+        break;
+      }
+      // Stop if we are matching at its limit because there could be a closer
+      // matching interval with the same maximum length. Then again, if the
+      // matching interval is as close as possible (best_distance == 1), we will
+      // never find anything better so let's continue.
+      if (best_length == MAX_LENGTH && best_distance != 1 &&
+          base_position + MAX_LENGTH < max_base_position) {
+        break;
+      }
+      if (best_length < MAX_LENGTH) {
+        ++best_length;
+        max_base_position = base_position;
+      }
+    }
+  }
+  return 1;
+}
+
+// Appends 'pixel' to 'refs': as a cache index when the color cache is in use
+// and already holds it, as a literal otherwise (updating the cache if used).
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+                                         VP8LColorCache* const hashers,
+                                         VP8LBackwardRefs* const refs) {
+  if (use_color_cache) {
+    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+    if (VP8LColorCacheLookup(hashers, key) == pixel) {
+      // Cache hit: emit the index instead of the color.
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCacheIdx(key));
+      return;
+    }
+    // Cache miss: store the color so that later occurrences can hit.
+    VP8LColorCacheSet(hashers, key, pixel);
+  }
+  VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(pixel));
+}
+
+// Codes each position as the longer of a run copied from the previous pixel
+// (distance 1, preferred on ties) and one copied from the row above (distance
+// 'xsize'), provided it reaches MIN_LENGTH; otherwise as a single literal.
+// Returns 0 if the color cache could not be set up or 'refs' hit an error.
+static int BackwardReferencesRle(int xsize, int ysize,
+                                 const uint32_t* const argb,
+                                 int cache_bits, VP8LBackwardRefs* const refs) {
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  VP8LColorCache hashers;
+  int pos = 1;
+  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
+    return 0;
+  }
+  VP8LClearBackwardRefs(refs);
+  // The first pixel has nothing to refer to: it is always a literal.
+  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+  while (pos < pix_count) {
+    const uint32_t* const cur = argb + pos;
+    const int max_len = MaxFindCopyLength(pix_count - pos);
+    const int rle_len = FindMatchLength(cur, cur - 1, 0, max_len);
+    const int prev_row_len =
+        (pos >= xsize) ? FindMatchLength(cur, cur - xsize, 0, max_len) : 0;
+    if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
+      // A run of one color leaves the color cache unchanged: no update needed.
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+      pos += rle_len;
+    } else if (prev_row_len >= MIN_LENGTH) {
+      int k;
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+      for (k = 0; use_color_cache && k < prev_row_len; ++k) {
+        VP8LColorCacheInsert(&hashers, cur[k]);
+      }
+      pos += prev_row_len;
+    } else {
+      AddSingleLiteral(*cur, use_color_cache, &hashers, refs);
+      ++pos;
+    }
+  }
+  if (use_color_cache) VP8LColorCacheClear(&hashers);
+  return !refs->error_;
+}
+// Builds 'refs' from the matches stored in 'hash_chain'. Returns 0 on error.
+static int BackwardReferencesLz77(int xsize, int ysize,
+                                  const uint32_t* const argb, int cache_bits,
+                                  const VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs) {
+  int i;
+  int i_last_check = -1;  // the look-ahead scan below resumes right after it
+  int ok = 0;  // stays 0 if the color cache cannot be initialized
+  int cc_init = 0;  // non-zero once 'hashers' must be cleared on exit
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  VP8LColorCache hashers;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+  VP8LClearBackwardRefs(refs);
+  for (i = 0; i < pix_count;) {
+    // Alternative#1: Code the pixels starting at 'i' using backward reference.
+    int offset = 0;
+    int len = 0;
+    int j;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+    if (len >= MIN_LENGTH) {
+      const int len_ini = len;
+      int max_reach = 0;
+      const int j_max =
+          (i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
+      // Only start from what we have not checked already.
+      i_last_check = (i > i_last_check) ? i : i_last_check;
+      // We know the best match for the current pixel but we try to find the
+      // best matches for the current pixel AND the next one combined.
+      // The naive method would use the intervals:
+      // [i,i+len) + [i+len, length of best match at i+len)
+      // while we check if we can use:
+      // [i,j) (where j<=i+len) + [j, length of best match at j)
+      for (j = i_last_check + 1; j <= j_max; ++j) {
+        const int len_j = VP8LHashChainFindLength(hash_chain, j);
+        const int reach =
+            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
+        if (reach > max_reach) {
+          len = j - i;  // shorten the current copy so that it ends at 'j'
+          max_reach = reach;
+          if (max_reach >= pix_count) break;
+        }
+      }
+    } else {
+      len = 1;  // match shorter than MIN_LENGTH: code a single pixel
+    }
+    // Go with literal or backward reference.
+    assert(len > 0);
+    if (len == 1) {
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+    } else {
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
+      }
+    }
+    i += len;
+  }
+
+  ok = !refs->error_;
+ Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return ok;
+}
+
+// Compute an LZ77 by forcing matches to happen within a given distance cost:
+// only the offsets mapping to the lowest 32 PlaneCode values are considered.
+// Fills 'hash_chain' with those matches; returns 0 on error.
+#define WINDOW_OFFSETS_SIZE_MAX 32
+static int BackwardReferencesLz77Box(int xsize, int ysize,
+                                     const uint32_t* const argb, int cache_bits,
+                                     const VP8LHashChain* const hash_chain_best,
+                                     VP8LHashChain* hash_chain,
+                                     VP8LBackwardRefs* const refs) {
+  int i;
+  const int pix_count = xsize * ysize;
+  uint16_t* counts;
+  int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
+  int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
+  int window_offsets_size = 0;
+  int window_offsets_new_size = 0;
+  uint16_t* const counts_ini =
+      (uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
+  int best_offset_prev = -1, best_length_prev = -1;  // result at pixel i - 1
+  if (counts_ini == NULL) return 0;
+
+  // counts[i] counts how many times a pixel is repeated starting at position i.
+  i = pix_count - 2;
+  counts = counts_ini + i;
+  counts[1] = 1;
+  for (; i >= 0; --i, --counts) {
+    if (argb[i] == argb[i + 1]) {
+      // Max out the counts to MAX_LENGTH.
+      counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
+    } else {
+      counts[0] = 1;
+    }
+  }
+
+  // Figure out the window offsets around a pixel. They are stored in a
+  // spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
+  {
+    int x, y;
+    for (y = 0; y <= 6; ++y) {
+      for (x = -6; x <= 6; ++x) {
+        const int offset = y * xsize + x;
+        int plane_code;
+        // Ignore offsets that bring us after the pixel.
+        if (offset <= 0) continue;
+        plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
+        if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
+        window_offsets[plane_code] = offset;
+      }
+    }
+    // For narrow images, not all plane codes are reached, so remove those.
+    for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
+      if (window_offsets[i] == 0) continue;
+      window_offsets[window_offsets_size++] = window_offsets[i];
+    }
+    // Given a pixel P, find the offsets that reach pixels unreachable from P-1
+    // with any of the offsets in window_offsets[].
+    for (i = 0; i < window_offsets_size; ++i) {
+      int j;
+      int is_reachable = 0;
+      for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
+        is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
+      }
+      if (!is_reachable) {
+        window_offsets_new[window_offsets_new_size] = window_offsets[i];
+        ++window_offsets_new_size;
+      }
+    }
+  }
+
+  hash_chain->offset_length_[0] = 0;
+  for (i = 1; i < pix_count; ++i) {
+    int ind;
+    int best_length = VP8LHashChainFindLength(hash_chain_best, i);
+    int best_offset;
+    int do_compute = 1;
+
+    if (best_length >= MAX_LENGTH) {
+      // Do not recompute the best match if we already have a maximal one in the
+      // window.
+      best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
+      for (ind = 0; ind < window_offsets_size; ++ind) {
+        if (best_offset == window_offsets[ind]) {
+          do_compute = 0;
+          break;
+        }
+      }
+    }
+    if (do_compute) {
+      // Figure out if we should use the offset/length from the previous pixel
+      // as an initial guess and therefore only inspect the offsets in
+      // window_offsets_new[].
+      const int use_prev =
+          (best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
+      const int num_ind =
+          use_prev ? window_offsets_new_size : window_offsets_size;
+      best_length = use_prev ? best_length_prev - 1 : 0;
+      best_offset = use_prev ? best_offset_prev : 0;
+      // Find the longest match in a window around the pixel.
+      for (ind = 0; ind < num_ind; ++ind) {
+        int curr_length = 0;
+        int j = i;
+        int j_offset =
+            use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
+        if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
+        // The longest match is the sum of how many times each pixel is
+        // repeated.
+        do {
+          const int counts_j_offset = counts_ini[j_offset];
+          const int counts_j = counts_ini[j];
+          if (counts_j_offset != counts_j) {
+            curr_length +=
+                (counts_j_offset < counts_j) ? counts_j_offset : counts_j;
+            break;
+          }
+          // The same color is repeated counts_j_offset times at j_offset and j.
+          curr_length += counts_j_offset;
+          j_offset += counts_j_offset;
+          j += counts_j_offset;
+        } while (curr_length <= MAX_LENGTH && j < pix_count &&
+                 argb[j_offset] == argb[j]);
+        if (best_length < curr_length) {
+          best_offset =
+              use_prev ? window_offsets_new[ind] : window_offsets[ind];
+          if (curr_length >= MAX_LENGTH) {
+            best_length = MAX_LENGTH;
+            break;
+          } else {
+            best_length = curr_length;
+          }
+        }
+      }
+    }
+
+    assert(i + best_length <= pix_count);
+    assert(best_length <= MAX_LENGTH);
+    if (best_length <= MIN_LENGTH) {
+      hash_chain->offset_length_[i] = 0;
+      best_offset_prev = 0;
+      best_length_prev = 0;
+    } else {
+      hash_chain->offset_length_[i] =
+          (best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
+      best_offset_prev = best_offset;
+      best_length_prev = best_length;
+    }
+  }
+  hash_chain->offset_length_[0] = 0;
+  WebPSafeFree(counts_ini);
+
+  return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
+                                refs);
+}
+
+// -----------------------------------------------------------------------------
+
+// Rewrites, in place, the distance of every copy in 'refs' as the value that
+// VP8LDistanceToPlaneCode() returns for it, given 'xsize'.
+static void BackwardReferences2DLocality(int xsize,
+                                         const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor it;
+  for (it = VP8LRefsCursorInit(refs); VP8LRefsCursorOk(&it);
+       VP8LRefsCursorNext(&it)) {
+    PixOrCopy* const v = it.cur_pos;
+    if (!PixOrCopyIsCopy(v)) continue;
+    v->argb_or_distance = VP8LDistanceToPlaneCode(xsize, v->argb_or_distance);
+  }
+}
+
+// Evaluate optimal cache bits for the local color cache.
+// The input *best_cache_bits sets the maximum cache bits to use (passing 0
+// implies disabling the local color cache). The local color cache is also
+// disabled for the lower (<= 25) quality. On success, *best_cache_bits holds
+// the value with the lowest estimated cost. Returns 0 in case of memory error.
+static int CalculateBestCacheSize(const uint32_t* argb, int quality,
+                                  const VP8LBackwardRefs* const refs,
+                                  int* const best_cache_bits) {
+  int i;
+  const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
+  double entropy_min = MAX_ENTROPY;
+  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };  // hashers[i] to clear
+  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+  int ok = 0;
+
+  assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
+
+  if (cache_bits_max == 0) {
+    *best_cache_bits = 0;
+    // Local color cache is disabled.
+    return 1;
+  }
+
+  // Allocate data.
+  for (i = 0; i <= cache_bits_max; ++i) {
+    histos[i] = VP8LAllocateHistogram(i);
+    if (histos[i] == NULL) goto Error;
+    if (i == 0) continue;  // no color cache is needed for 0 bits
+    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+    if (!cc_init[i]) goto Error;
+  }
+
+  // Find the cache_bits giving the lowest entropy. The search is done in a
+  // brute-force way as the function (entropy w.r.t cache_bits) can be
+  // anything in practice.
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
+    if (PixOrCopyIsLiteral(v)) {
+      const uint32_t pix = *argb++;
+      const uint32_t a = (pix >> 24) & 0xff;
+      const uint32_t r = (pix >> 16) & 0xff;
+      const uint32_t g = (pix >>  8) & 0xff;
+      const uint32_t b = (pix >>  0) & 0xff;
+      // The keys of the caches can be derived from the longest one.
+      int key = VP8LHashPix(pix, 32 - cache_bits_max);
+      // Do not use the color cache for cache_bits = 0.
+      ++histos[0]->blue_[b];
+      ++histos[0]->literal_[g];
+      ++histos[0]->red_[r];
+      ++histos[0]->alpha_[a];
+      // Deal with cache_bits > 0.
+      for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+        if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+          ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+        } else {
+          VP8LColorCacheSet(&hashers[i], key, pix);
+          ++histos[i]->blue_[b];
+          ++histos[i]->literal_[g];
+          ++histos[i]->red_[r];
+          ++histos[i]->alpha_[a];
+        }
+      }
+    } else {
+      // We should compute the contribution of the (distance,length)
+      // histograms but those are the same independently from the cache size.
+      // As those constant contributions are in the end added to the other
+      // histogram contributions, we can safely ignore them.
+      int len = PixOrCopyLength(v);
+      uint32_t argb_prev = *argb ^ 0xffffffffu;  // guaranteed != *argb
+      // Update the color caches.
+      do {
+        if (*argb != argb_prev) {
+          // Efficiency: insert only if the color changes.
+          int key = VP8LHashPix(*argb, 32 - cache_bits_max);
+          for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+            hashers[i].colors_[key] = *argb;
+          }
+          argb_prev = *argb;
+        }
+        argb++;
+      } while (--len != 0);
+    }
+    VP8LRefsCursorNext(&c);
+  }
+
+  for (i = 0; i <= cache_bits_max; ++i) {
+    const double entropy = VP8LHistogramEstimateBits(histos[i]);
+    if (i == 0 || entropy < entropy_min) {
+      entropy_min = entropy;
+      *best_cache_bits = i;
+    }
+  }
+  ok = 1;
+Error:
+  for (i = 0; i <= cache_bits_max; ++i) {
+    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+    VP8LFreeHistogram(histos[i]);
+  }
+  return ok;
+}
+
+// Replaces, in place, every literal of 'refs' that is already in a color cache
+// of 'cache_bits' bits by its cache index. All other pixels are inserted into
+// the cache, those covered by a copy being read from 'argb'. Returns 0 if the
+// cache cannot be initialized, 1 otherwise.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+                                      int cache_bits,
+                                      VP8LBackwardRefs* const refs) {
+  VP8LColorCache hashers;
+  VP8LRefsCursor it = VP8LRefsCursorInit(refs);
+  const uint32_t* src = argb;  // first pixel not yet consumed by 'refs'
+  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+  for (; VP8LRefsCursorOk(&it); VP8LRefsCursorNext(&it)) {
+    PixOrCopy* const v = it.cur_pos;
+    if (!PixOrCopyIsLiteral(v)) {
+      // No cache index can exist yet: 'refs' was created without local cache.
+      const uint32_t* const end = src + v->len;
+      assert(PixOrCopyIsCopy(v));
+      while (src < end) VP8LColorCacheInsert(&hashers, *src++);
+    } else {
+      const uint32_t color = v->argb_or_distance;
+      const int idx = VP8LColorCacheContains(&hashers, color);
+      if (idx < 0) {
+        VP8LColorCacheInsert(&hashers, color);
+      } else {
+        // The cache already contains this color.
+        *v = PixOrCopyCreateCacheIdx(idx);
+      }
+      ++src;
+    }
+  }
+  VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+// Low-effort variant: plain LZ77, no color cache. Returns NULL on failure.
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+    int width, int height, const uint32_t* const argb,
+    int* const cache_bits, const VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs* const refs_lz77) {
+  const int ok =
+      BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77);
+  *cache_bits = 0;
+  if (ok) BackwardReferences2DLocality(width, refs_lz77);
+  return ok ? refs_lz77 : NULL;
+}
+
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+// Tries every LZ77 variant enabled in 'lz77_types_to_try' and keeps the one
+// with the lowest estimated bit cost, storing its cache bits in '*cache_bits'.
+// Returns the winning references (either 'best' or 'worst' as passed in), or
+// NULL if an allocation or a computation step failed.
+static VP8LBackwardRefs* GetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best,
+    VP8LBackwardRefs* worst) {
+  const int cache_bits_initial = *cache_bits;
+  double bit_cost_best = -1;
+  VP8LHistogram* histo = NULL;
+  int lz77_type, lz77_type_best = 0;
+  int ok = 0;  // Set only once every step has succeeded.
+  VP8LHashChain hash_chain_box;
+  memset(&hash_chain_box, 0, sizeof(hash_chain_box));
+  histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
+  if (histo == NULL) goto Error;
+
+  for (lz77_type = 1; lz77_types_to_try;
+       lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
+    int res = 0;
+    double bit_cost;
+    int cache_bits_tmp = cache_bits_initial;
+    if ((lz77_types_to_try & lz77_type) == 0) continue;
+    switch (lz77_type) {
+      case kLZ77RLE:
+        res = BackwardReferencesRle(width, height, argb, 0, worst);
+        break;
+      case kLZ77Standard:
+        // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
+        // cache is not that different in practice.
+        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst);
+        break;
+      case kLZ77Box:
+        if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
+        res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
+                                        &hash_chain_box, worst);
+        break;
+      default:
+        assert(0);
+    }
+    if (!res) goto Error;
+
+    // Next, try with a color cache and update the references.
+    if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp) ||
+        (cache_bits_tmp > 0 &&
+         !BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst))) {
+      goto Error;
+    }
+
+    // Keep the best backward references.
+    VP8LHistogramCreate(histo, worst, cache_bits_tmp);
+    bit_cost = VP8LHistogramEstimateBits(histo);
+    if (lz77_type_best == 0 || bit_cost < bit_cost_best) {
+      VP8LBackwardRefs* const tmp = worst;
+      worst = best;
+      best = tmp;
+      bit_cost_best = bit_cost;
+      *cache_bits = cache_bits_tmp;
+      lz77_type_best = lz77_type;
+    }
+  }
+  assert(lz77_type_best > 0);
+
+  // Improve on simple LZ77 but only for high quality (TraceBackwards is
+  // costly).
+  if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) &&
+      quality >= 25) {
+    const VP8LHashChain* const hash_chain_tmp =
+        (lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box;
+    if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits,
+                                             hash_chain_tmp, best, worst)) {
+      VP8LHistogramCreate(histo, worst, *cache_bits);
+      if (VP8LHistogramEstimateBits(histo) < bit_cost_best) best = worst;
+    }
+  }
+
+  BackwardReferences2DLocality(width, best);
+  ok = 1;
+
+Error:
+  VP8LHashChainClear(&hash_chain_box);
+  VP8LFreeHistogram(histo);
+  return ok ? best : NULL;
+}
+
+// Public entry point. With 'low_effort' set, only the plain LZ77 references
+// are computed into 'refs_tmp1'; otherwise every type in 'lz77_types_to_try'
+// is evaluated using both buffers. The result is forwarded unchanged.
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int low_effort, int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
+    VP8LBackwardRefs* const refs_tmp2) {
+  return low_effort
+      ? GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
+                                       hash_chain, refs_tmp1)
+      : GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
+                              cache_bits, hash_chain, refs_tmp1, refs_tmp2);
+}
diff --git a/thirdparty/libwebp/enc/backward_references_enc.h b/thirdparty/libwebp/src/enc/backward_references_enc.h
index 3a19aa763e..103ddfdcb7 100644
--- a/thirdparty/libwebp/enc/backward_references_enc.h
+++ b/thirdparty/libwebp/src/enc/backward_references_enc.h
@@ -10,13 +10,13 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 
-#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_
-#define WEBP_ENC_BACKWARD_REFERENCES_H_
+#ifndef WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
+#define WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
 
 #include <assert.h>
 #include <stdlib.h>
-#include "../webp/types.h"
-#include "../webp/format_constants.h"
+#include "src/webp/types.h"
+#include "src/webp/format_constants.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -91,11 +91,6 @@ static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
   return p->len;
 }
 
-static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) {
-  assert(p->mode == kLiteral);
-  return p->argb_or_distance;
-}
-
 static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
   assert(p->mode == kCacheIdx);
   assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
@@ -113,6 +108,16 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
 #define HASH_BITS 18
 #define HASH_SIZE (1 << HASH_BITS)
 
+// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
+// is used in VP8LHashChain.
+#define MAX_LENGTH_BITS 12
+#define WINDOW_SIZE_BITS 20
+// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
+#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
+#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
+#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
+#endif
+
 typedef struct VP8LHashChain VP8LHashChain;
 struct VP8LHashChain {
   // The 20 most significant bits contain the offset at which the best match
@@ -134,6 +139,24 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                       int low_effort);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
+static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
+                                               const int base_position) {
+  return p->offset_length_[base_position] >> MAX_LENGTH_BITS;  // upper bits
+}
+// Returns the low MAX_LENGTH_BITS bits of the entry at 'base_position'.
+static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
+                                               const int base_position) {
+  return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
+}
+// Reads both the offset and the length of the entry at 'base_position'.
+static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
+                                              int base_position,
+                                              int* const offset_ptr,
+                                              int* const length_ptr) {
+  *offset_ptr = VP8LHashChainFindOffset(p, base_position);
+  *length_ptr = VP8LHashChainFindLength(p, base_position);
+}
+
 // -----------------------------------------------------------------------------
 // VP8LBackwardRefs (block-based backward-references storage)
 
@@ -158,9 +181,6 @@ struct VP8LBackwardRefs {
 void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
 // Release memory for backward references.
 void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
-// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
-int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
-                         VP8LBackwardRefs* const dst);
 
 // Cursor for iterating on references content
 typedef struct {
@@ -189,6 +209,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // -----------------------------------------------------------------------------
 // Main entry points
 
+enum VP8LLZ77Type {  // one distinct bit per type, so types can be OR-ed
+  kLZ77Standard = 1,
+  kLZ77RLE = 2,
+  kLZ77Box = 4
+};
+
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
@@ -197,11 +223,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // refs[0] or refs[1].
 VP8LBackwardRefs* VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]);
+    int low_effort, int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
+    VP8LBackwardRefs* const refs_tmp2);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif  // WEBP_ENC_BACKWARD_REFERENCES_H_
+#endif  // WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
diff --git a/thirdparty/libwebp/enc/config_enc.c b/thirdparty/libwebp/src/enc/config_enc.c
index 4589dc0619..9d4828978e 100644
--- a/thirdparty/libwebp/enc/config_enc.c
+++ b/thirdparty/libwebp/src/enc/config_enc.c
@@ -12,10 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../webp/encode.h"
+#include "src/webp/encode.h"
 
 //------------------------------------------------------------------------------
 // WebPConfig
diff --git a/thirdparty/libwebp/enc/cost_enc.c b/thirdparty/libwebp/src/enc/cost_enc.c
index c823f5a664..48fd9bc347 100644
--- a/thirdparty/libwebp/enc/cost_enc.c
+++ b/thirdparty/libwebp/src/enc/cost_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./cost_enc.h"
+#include "src/enc/cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Level cost tables
diff --git a/thirdparty/libwebp/enc/cost_enc.h b/thirdparty/libwebp/src/enc/cost_enc.h
index 99e4b37aa3..bdce1e6a3b 100644
--- a/thirdparty/libwebp/enc/cost_enc.h
+++ b/thirdparty/libwebp/src/enc/cost_enc.h
@@ -11,12 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_ENC_COST_H_
-#define WEBP_ENC_COST_H_
+#ifndef WEBP_ENC_COST_ENC_H_
+#define WEBP_ENC_COST_ENC_H_
 
 #include <assert.h>
 #include <stdlib.h>
-#include "./vp8i_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -79,4 +79,4 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_ENC_COST_H_ */
+#endif  /* WEBP_ENC_COST_ENC_H_ */
diff --git a/thirdparty/libwebp/enc/delta_palettization_enc.c b/thirdparty/libwebp/src/enc/delta_palettization_enc.c
index eaf0f050ea..a61c8e6c93 100644
--- a/thirdparty/libwebp/enc/delta_palettization_enc.c
+++ b/thirdparty/libwebp/src/enc/delta_palettization_enc.c
@@ -10,11 +10,11 @@
 // Author: Mislav Bradac (mislavm@google.com)
 //
 
-#include "./delta_palettization_enc.h"
+#include "src/enc/delta_palettization_enc.h"
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../webp/types.h"
-#include "../dsp/lossless.h"
+#include "src/webp/types.h"
+#include "src/dsp/lossless.h"
 
 #define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
 
diff --git a/thirdparty/libwebp/enc/delta_palettization_enc.h b/thirdparty/libwebp/src/enc/delta_palettization_enc.h
index 63048ec6e8..b15e2cd487 100644
--- a/thirdparty/libwebp/enc/delta_palettization_enc.h
+++ b/thirdparty/libwebp/src/enc/delta_palettization_enc.h
@@ -10,11 +10,11 @@
 // Author: Mislav Bradac (mislavm@google.com)
 //
 
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_H_
+#ifndef WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
+#define WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
 
-#include "../webp/encode.h"
-#include "../enc/vp8li_enc.h"
+#include "src/webp/encode.h"
+#include "src/enc/vp8li_enc.h"
 
 // Replaces enc->argb_[] input by a palettizable approximation of it,
 // and generates optimal enc->palette_[].
@@ -22,4 +22,4 @@
 // if delta-palettization is not producing expected saving.
 WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
 
-#endif  // WEBP_ENC_DELTA_PALETTIZATION_H_
+#endif  // WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
diff --git a/thirdparty/libwebp/enc/filter_enc.c b/thirdparty/libwebp/src/enc/filter_enc.c
index 4bc367274c..580800bfb8 100644
--- a/thirdparty/libwebp/enc/filter_enc.c
+++ b/thirdparty/libwebp/src/enc/filter_enc.c
@@ -12,8 +12,8 @@
 // Author: somnath@google.com (Somnath Banerjee)
 
 #include <assert.h>
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
 
 // This table gives, for a given sharpness, the filtering strength to be
 // used (at least) in order to filter a given edge step delta.
@@ -65,6 +65,8 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta) {
 //------------------------------------------------------------------------------
 // Paragraph 15.4: compute the inner-edge filtering strength
 
+#if !defined(WEBP_REDUCE_SIZE)
+
 static int GetILevel(int sharpness, int level) {
   if (sharpness > 0) {
     if (sharpness > 4) {
@@ -129,11 +131,14 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   return sum;
 }
 
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
 //------------------------------------------------------------------------------
 // Exposed APIs: Encoder should call the following 3 functions to adjust
 // loop filter strength
 
 void VP8InitFilter(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
   if (it->lf_stats_ != NULL) {
     int s, i;
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
@@ -143,9 +148,13 @@ void VP8InitFilter(VP8EncIterator* const it) {
     }
     VP8SSIMDspInit();
   }
+#else
+  (void)it;
+#endif
 }
 
 void VP8StoreFilterStats(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
   int d;
   VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
@@ -177,10 +186,14 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
     DoFilter(it, level);
     (*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_);
   }
+#else  // defined(WEBP_REDUCE_SIZE)
+  (void)it;
+#endif  // !defined(WEBP_REDUCE_SIZE)
 }
 
 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
+#if !defined(WEBP_REDUCE_SIZE)
   if (it->lf_stats_ != NULL) {
     int s;
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
@@ -196,7 +209,10 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it) {
       }
       enc->dqm_[s].fstrength_ = best_level;
     }
-  } else if (enc->config_->filter_strength > 0) {
+    return;
+  }
+#endif  // !defined(WEBP_REDUCE_SIZE)
+  if (enc->config_->filter_strength > 0) {
     int max_level = 0;
     int s;
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
diff --git a/thirdparty/libwebp/enc/frame_enc.c b/thirdparty/libwebp/src/enc/frame_enc.c
index abef523bbf..2b0dc66410 100644
--- a/thirdparty/libwebp/enc/frame_enc.c
+++ b/thirdparty/libwebp/src/enc/frame_enc.c
@@ -14,10 +14,10 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost_enc.h"
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
-#include "../webp/format_constants.h"  // RIFF constants
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/webp/format_constants.h"  // RIFF constants
 
 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@@ -200,11 +200,13 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
     const VP8MBInfo* const mb = &enc->mb_info_[n];
     p[mb->segment_]++;
   }
+#if !defined(WEBP_DISABLE_STATS)
   if (enc->pic_->stats != NULL) {
     for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
       enc->pic_->stats->segment_size[n] = p[n];
     }
   }
+#endif
   if (enc->segment_hdr_.num_segments_ > 1) {
     uint8_t* const probas = enc->proba_.segments_;
     probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
@@ -452,6 +454,8 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
 //------------------------------------------------------------------------------
 // ExtraInfo map / Debug function
 
+#if !defined(WEBP_DISABLE_STATS)
+
 #if SEGMENT_VISU
 static void SetBlock(uint8_t* p, int value, int size) {
   int y;
@@ -516,6 +520,20 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
+#else  // defined(WEBP_DISABLE_STATS)
+static void ResetSSE(VP8Encoder* const enc) {
+  (void)enc;
+}
+static void StoreSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->extra_info != NULL) {
+    memset(pic->extra_info, 0,
+           enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+  }
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
 static double GetPSNR(uint64_t mse, uint64_t size) {
   return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
 }
@@ -640,7 +658,7 @@ static int StatLoop(VP8Encoder* const enc) {
 // Main loops
 //
 
-static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
 
 static int PreLoopInitialize(VP8Encoder* const enc) {
   int p;
@@ -670,6 +688,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
   }
 
   if (ok) {      // All good. Finish up.
+#if !defined(WEBP_DISABLE_STATS)
     if (enc->pic_->stats != NULL) {  // finalize byte counters...
       int i, s;
       for (i = 0; i <= 2; ++i) {
@@ -678,6 +697,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
         }
       }
     }
+#endif
     VP8AdjustFilterStrength(it);     // ...and store filter stats.
   } else {
     // Something bad happened -> need to do some memory cleanup.
diff --git a/thirdparty/libwebp/enc/histogram_enc.c b/thirdparty/libwebp/src/enc/histogram_enc.c
index 808b6f78ab..056a972dda 100644
--- a/thirdparty/libwebp/enc/histogram_enc.c
+++ b/thirdparty/libwebp/src/enc/histogram_enc.c
@@ -10,16 +10,16 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <math.h>
 
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../utils/utils.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/utils.h"
 
 #define MAX_COST 1.e38
 
@@ -76,7 +76,7 @@ void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
                             VP8LHistogram* const histo) {
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
     VP8LRefsCursorNext(&c);
   }
 }
@@ -138,7 +138,9 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
 // -----------------------------------------------------------------------------
 
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v) {
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0) {
   if (PixOrCopyIsLiteral(v)) {
     ++histo->alpha_[PixOrCopyLiteral(v, 3)];
     ++histo->red_[PixOrCopyLiteral(v, 2)];
@@ -152,7 +154,13 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
     int code, extra_bits;
     VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
     ++histo->literal_[NUM_LITERAL_CODES + code];
-    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    if (distance_modifier == NULL) {
+      VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    } else {
+      VP8LPrefixEncodeBits(
+          distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
+          &code, &extra_bits);
+    }
     ++histo->distance_[code];
   }
 }
@@ -473,7 +481,7 @@ static void HistogramBuild(
   while (VP8LRefsCursorOk(&c)) {
     const PixOrCopy* const v = c.cur_pos;
     const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
-    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
     x += PixOrCopyLength(v);
     while (x >= xsize) {
       x -= xsize;
@@ -523,11 +531,12 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
 
 // Compact image_histo[] by merging some histograms with same bin_id together if
 // it's advantageous.
-static VP8LHistogram* HistogramCombineEntropyBin(
-    VP8LHistogramSet* const image_histo,
-    VP8LHistogram* cur_combo,
-    const uint16_t* const bin_map, int bin_map_size, int num_bins,
-    double combine_cost_factor, int low_effort) {
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+                                       VP8LHistogram* cur_combo,
+                                       const uint16_t* const bin_map,
+                                       int bin_map_size, int num_bins,
+                                       double combine_cost_factor,
+                                       int low_effort) {
   VP8LHistogram** const histograms = image_histo->histograms;
   int idx;
   // Work in-place: processed histograms are put at the beginning of
@@ -593,14 +602,13 @@ static VP8LHistogram* HistogramCombineEntropyBin(
       UpdateHistogramCost(histograms[idx]);
     }
   }
-  return cur_combo;
 }
 
+// Implement a Lehmer random number generator with a multiplicative constant of
+// 48271 and a modulo constant of 2^31 − 1.
 static uint32_t MyRand(uint32_t* const seed) {
-  *seed = (*seed * 16807ull) & 0xffffffffu;
-  if (*seed == 0) {
-    *seed = 1;
-  }
+  *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
+  assert(*seed > 0);
   return *seed;
 }
 
@@ -641,57 +649,75 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
 static void HistoQueueClear(HistoQueue* const histo_queue) {
   assert(histo_queue != NULL);
   WebPSafeFree(histo_queue->queue);
+  histo_queue->size = 0;
+  histo_queue->max_size = 0;
 }
 
-static void SwapHistogramPairs(HistogramPair *p1,
-                               HistogramPair *p2) {
-  const HistogramPair tmp = *p1;
-  *p1 = *p2;
-  *p2 = tmp;
+// Pop a specific pair in the queue by replacing it with the last one
+// and shrinking the queue.
+static void HistoQueuePopPair(HistoQueue* const histo_queue,
+                              HistogramPair* const pair) {
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  *pair = histo_queue->queue[histo_queue->size - 1];
+  --histo_queue->size;
 }
 
-// Given a valid priority queue in range [0, queue_size) this function checks
-// whether histo_queue[queue_size] should be accepted and swaps it with the
-// front if it is smaller. Otherwise, it leaves it as is.
-static void UpdateQueueFront(HistoQueue* const histo_queue) {
-  if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return;
-
-  if (histo_queue->queue[histo_queue->size].cost_diff <
-      histo_queue->queue[0].cost_diff) {
-    SwapHistogramPairs(histo_queue->queue,
-                       histo_queue->queue + histo_queue->size);
+// Check whether a pair in the queue should be updated as head or not.
+static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
+                                 HistogramPair* const pair) {
+  assert(pair->cost_diff < 0.);
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
+    // Replace the best pair.
+    const HistogramPair tmp = histo_queue->queue[0];
+    histo_queue->queue[0] = *pair;
+    *pair = tmp;
   }
-  ++histo_queue->size;
-
-  // We cannot add more elements than the capacity.
-  // The allocation adds an extra element to the official capacity so that
-  // histo_queue->queue[histo_queue->max_size] is read/written within bound.
-  assert(histo_queue->size <= histo_queue->max_size);
 }
 
-// -----------------------------------------------------------------------------
-
-static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2,
-                        HistogramPair* const pair) {
-  VP8LHistogram* h1;
-  VP8LHistogram* h2;
+// Create a pair from indices "idx1" and "idx2" provided its cost
+// is inferior to "threshold", a negative entropy.
+// It returns the cost of the pair, or 0. if it is not inferior to "threshold".
+static double HistoQueuePush(HistoQueue* const histo_queue,
+                             VP8LHistogram** const histograms, int idx1,
+                             int idx2, double threshold) {
+  const VP8LHistogram* h1;
+  const VP8LHistogram* h2;
+  HistogramPair pair;
   double sum_cost;
 
+  assert(threshold <= 0.);
   if (idx1 > idx2) {
     const int tmp = idx2;
     idx2 = idx1;
     idx1 = tmp;
   }
-  pair->idx1 = idx1;
-  pair->idx2 = idx2;
+  pair.idx1 = idx1;
+  pair.idx2 = idx2;
   h1 = histograms[idx1];
   h2 = histograms[idx2];
   sum_cost = h1->bit_cost_ + h2->bit_cost_;
-  pair->cost_combo = 0.;
-  GetCombinedHistogramEntropy(h1, h2, sum_cost, &pair->cost_combo);
-  pair->cost_diff = pair->cost_combo - sum_cost;
+  pair.cost_combo = 0.;
+  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair.cost_combo);
+  pair.cost_diff = pair.cost_combo - sum_cost;
+
+  // Do not even consider the pair if it does not improve the entropy.
+  if (pair.cost_diff >= threshold) return 0.;
+
+  // We cannot add more elements than the capacity.
+  assert(histo_queue->size < histo_queue->max_size);
+  histo_queue->queue[histo_queue->size++] = pair;
+  HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
+
+  return pair.cost_diff;
 }
 
+// -----------------------------------------------------------------------------
+
 // Combines histograms by continuously choosing the one with the highest cost
 // reduction.
 static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
@@ -714,13 +740,11 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
     clusters[i] = i;
     for (j = i + 1; j < image_histo_size; ++j) {
       // Initialize positions array.
-      PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size]);
-      UpdateQueueFront(&histo_queue);
+      HistoQueuePush(&histo_queue, histograms, i, j, 0.);
     }
   }
 
   while (image_histo_size > 1 && histo_queue.size > 0) {
-    HistogramPair* copy_to;
     const int idx1 = histo_queue.queue[0].idx1;
     const int idx2 = histo_queue.queue[0].idx2;
     HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
@@ -733,31 +757,22 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
     }
     --image_histo_size;
 
-    // Remove pairs intersecting the just combined best pair. This will
-    // therefore pop the head of the queue.
-    copy_to = histo_queue.queue;
-    for (i = 0; i < histo_queue.size; ++i) {
+    // Remove pairs intersecting the just combined best pair.
+    for (i = 0; i < histo_queue.size;) {
       HistogramPair* const p = histo_queue.queue + i;
       if (p->idx1 == idx1 || p->idx2 == idx1 ||
           p->idx1 == idx2 || p->idx2 == idx2) {
-        // Do not copy the invalid pair.
-        continue;
-      }
-      if (p->cost_diff < histo_queue.queue[0].cost_diff) {
-        // Replace the top of the queue if we found better.
-        SwapHistogramPairs(histo_queue.queue, p);
+        HistoQueuePopPair(&histo_queue, p);
+      } else {
+        HistoQueueUpdateHead(&histo_queue, p);
+        ++i;
       }
-      SwapHistogramPairs(copy_to, p);
-      ++copy_to;
     }
-    histo_queue.size = (int)(copy_to - histo_queue.queue);
 
     // Push new pairs formed with combined histogram to the queue.
     for (i = 0; i < image_histo_size; ++i) {
       if (clusters[i] != idx1) {
-        PreparePair(histograms, idx1, clusters[i],
-                    &histo_queue.queue[histo_queue.size]);
-        UpdateQueueFront(&histo_queue);
+        HistoQueuePush(&histo_queue, histograms, idx1, clusters[i], 0.);
       }
     }
   }
@@ -777,90 +792,130 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
   return ok;
 }
 
-static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
-                                       VP8LHistogram* tmp_histo,
-                                       VP8LHistogram* best_combo,
-                                       int quality, int min_cluster_size) {
+// Perform histogram aggregation using a stochastic approach.
+// 'do_greedy' is set to 1 if a greedy approach needs to be performed
+// afterwards, 0 otherwise.
+static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+                                      int min_cluster_size,
+                                      int* const do_greedy) {
   int iter;
-  uint32_t seed = 0;
+  uint32_t seed = 1;
   int tries_with_no_success = 0;
   int image_histo_size = image_histo->size;
-  const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8;
-  const int outer_iters = image_histo_size * iter_mult;
-  const int num_pairs = image_histo_size / 2;
+  const int outer_iters = image_histo_size;
   const int num_tries_no_success = outer_iters / 2;
-  int idx2_max = image_histo_size - 1;
-  int do_brute_dorce = 0;
   VP8LHistogram** const histograms = image_histo->histograms;
+  // Priority queue of histogram pairs. Its size of "kHistoQueueSizeSqrt"^2
+  // impacts the quality of the compression and the speed: the smaller the
+  // faster but the worse for the compression.
+  HistoQueue histo_queue;
+  const int kHistoQueueSizeSqrt = 3;
+  int ok = 0;
 
+  if (!HistoQueueInit(&histo_queue, kHistoQueueSizeSqrt)) {
+    goto End;
+  }
   // Collapse similar histograms in 'image_histo'.
   ++min_cluster_size;
-  for (iter = 0;
-       iter < outer_iters && image_histo_size >= min_cluster_size;
+  for (iter = 0; iter < outer_iters && image_histo_size >= min_cluster_size &&
+                 ++tries_with_no_success < num_tries_no_success;
        ++iter) {
-    double best_cost_diff = 0.;
+    double best_cost =
+        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
     int best_idx1 = -1, best_idx2 = 1;
     int j;
-    int num_tries =
-        (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
-    // Use a brute force approach if:
-    // - stochastic has not worked for a while and
-    // - if the number of iterations for brute force is less than the number of
-    // iterations if we never find a match ever again stochastically (hence
-    // num_tries times the number of remaining outer iterations).
-    do_brute_dorce =
-        (tries_with_no_success > 10) &&
-        (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
-    if (do_brute_dorce) num_tries = idx2_max;
-
-    seed += iter;
-    for (j = 0; j < num_tries; ++j) {
-      double curr_cost_diff;
-      // Choose two histograms at random and try to combine them.
-      uint32_t idx1, idx2;
-      if (do_brute_dorce) {
-        // Use a brute force approach.
-        idx1 = (uint32_t)j;
-        idx2 = (uint32_t)idx2_max;
-      } else {
-        const uint32_t tmp = (j & 7) + 1;
-        const uint32_t diff =
-            (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
-        idx1 = MyRand(&seed) % image_histo_size;
-        idx2 = (idx1 + diff + 1) % image_histo_size;
-        if (idx1 == idx2) {
-          continue;
-        }
-      }
+    const uint32_t rand_range = (image_histo_size - 1) * image_histo_size;
+    // image_histo_size / 2 was chosen empirically. Less means faster but worse
+    // compression.
+    const int num_tries = image_histo_size / 2;
 
-      // Calculate cost reduction on combining.
-      curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
-                                        tmp_histo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {  // found a better pair?
-        HistogramSwap(&best_combo, &tmp_histo);
-        best_cost_diff = curr_cost_diff;
-        best_idx1 = idx1;
-        best_idx2 = idx2;
+    for (j = 0; j < num_tries; ++j) {
+      double curr_cost;
+      // Choose two different histograms at random and try to combine them.
+      const uint32_t tmp = MyRand(&seed) % rand_range;
+      const uint32_t idx1 = tmp / (image_histo_size - 1);
+      uint32_t idx2 = tmp % (image_histo_size - 1);
+      if (idx2 >= idx1) ++idx2;
+
+      // Calculate cost reduction on combination.
+      curr_cost =
+          HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
+      if (curr_cost < 0) {  // found a better pair?
+        best_cost = curr_cost;
+        // Empty the queue if we reached full capacity.
+        if (histo_queue.size == histo_queue.max_size) break;
       }
     }
-    if (do_brute_dorce) --idx2_max;
-
-    if (best_idx1 >= 0) {
-      HistogramSwap(&best_combo, &histograms[best_idx1]);
-      // swap best_idx2 slot with last one (which is now unused)
-      --image_histo_size;
-      if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
-      if (best_idx2 != image_histo_size) {
-        HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
-        histograms[image_histo_size] = NULL;
-      }
-      tries_with_no_success = 0;
+    if (histo_queue.size == 0) continue;
+
+    // Merge the two best histograms.
+    best_idx1 = histo_queue.queue[0].idx1;
+    best_idx2 = histo_queue.queue[0].idx2;
+    assert(best_idx1 < best_idx2);
+    HistogramAddEval(histograms[best_idx1], histograms[best_idx2],
+                     histograms[best_idx1], 0);
+    // Swap the best_idx2 histogram with the last one (which is now unused).
+    --image_histo_size;
+    if (best_idx2 != image_histo_size) {
+      HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
     }
-    if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
-      break;
+    histograms[image_histo_size] = NULL;
+    // Parse the queue and update each pair that deals with best_idx1,
+    // best_idx2 or image_histo_size.
+    for (j = 0; j < histo_queue.size;) {
+      HistogramPair* const p = histo_queue.queue + j;
+      const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
+      const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
+      int do_eval = 0;
+      // The front pair could have been duplicated by a random pick so
+      // check for it all the time nevertheless.
+      if (is_idx1_best && is_idx2_best) {
+        HistoQueuePopPair(&histo_queue, p);
+        continue;
+      }
+      // Any pair containing one of the two best indices should only refer to
+      // best_idx1. Its cost should also be updated.
+      if (is_idx1_best) {
+        p->idx1 = best_idx1;
+        do_eval = 1;
+      } else if (is_idx2_best) {
+        p->idx2 = best_idx1;
+        do_eval = 1;
+      }
+      if (p->idx2 == image_histo_size) {
+        // No need to re-evaluate here as it does not involve a pair
+        // containing best_idx1 or best_idx2.
+        p->idx2 = best_idx2;
+      }
+      assert(p->idx2 < image_histo_size);
+      // Make sure the index order is respected.
+      if (p->idx1 > p->idx2) {
+        const int tmp = p->idx2;
+        p->idx2 = p->idx1;
+        p->idx1 = tmp;
+      }
+      if (do_eval) {
+        // Re-evaluate the cost of an updated pair.
+        GetCombinedHistogramEntropy(histograms[p->idx1], histograms[p->idx2], 0,
+                                    &p->cost_diff);
+        if (p->cost_diff >= 0.) {
+          HistoQueuePopPair(&histo_queue, p);
+          continue;
+        }
+      }
+      HistoQueueUpdateHead(&histo_queue, p);
+      ++j;
     }
+
+    tries_with_no_success = 0;
   }
   image_histo->size = image_histo_size;
+  *do_greedy = (image_histo->size <= min_cluster_size);
+  ok = 1;
+
+End:
+  HistoQueueClear(&histo_queue);
+  return ok;
 }
 
 // -----------------------------------------------------------------------------
@@ -925,7 +980,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              int quality, int low_effort,
                              int histo_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
-                             VP8LHistogramSet* const tmp_histos,
+                             VP8LHistogram* const tmp_histo,
                              uint16_t* const histogram_symbols) {
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
@@ -933,7 +988,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   const int image_histo_raw_size = histo_xsize * histo_ysize;
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
-  VP8LHistogram* cur_combo;
   // Don't attempt linear bin-partition heuristic for
   // histograms of small sizes (as bin_map will be very sparse) and
   // maximum quality q==100 (to preserve the compression gains at that level).
@@ -948,7 +1002,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Copies the histograms and computes its bit_cost.
   HistogramCopyAndAnalyze(orig_histo, image_histo);
 
-  cur_combo = tmp_histos->histograms[1];  // pick up working slot
   if (entropy_combine) {
     const int bin_map_size = orig_histo->size;
     // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
@@ -958,10 +1011,9 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
     HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
-                                           bin_map, bin_map_size,
-                                           entropy_combine_num_bins,
-                                           combine_cost_factor, low_effort);
+    HistogramCombineEntropyBin(image_histo, tmp_histo, bin_map, bin_map_size,
+                               entropy_combine_num_bins, combine_cost_factor,
+                               low_effort);
   }
 
   // Don't combine the histograms using stochastic and greedy heuristics for
@@ -970,10 +1022,11 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
     const float x = quality / 100.f;
     // cubic ramp between 1 and MAX_HISTO_GREEDY:
     const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
-    HistogramCombineStochastic(image_histo, tmp_histos->histograms[0],
-                               cur_combo, quality, threshold_size);
-    if ((image_histo->size <= threshold_size) &&
-        !HistogramCombineGreedy(image_histo)) {
+    int do_greedy;
+    if (!HistogramCombineStochastic(image_histo, threshold_size, &do_greedy)) {
+      goto Error;
+    }
+    if (do_greedy && !HistogramCombineGreedy(image_histo)) {
       goto Error;
     }
   }
diff --git a/thirdparty/libwebp/enc/histogram_enc.h b/thirdparty/libwebp/src/enc/histogram_enc.h
index a9d258a166..15b1fbda34 100644
--- a/thirdparty/libwebp/enc/histogram_enc.h
+++ b/thirdparty/libwebp/src/enc/histogram_enc.h
@@ -11,14 +11,14 @@
 //
 // Models the histograms of literal and distance codes.
 
-#ifndef WEBP_ENC_HISTOGRAM_H_
-#define WEBP_ENC_HISTOGRAM_H_
+#ifndef WEBP_ENC_HISTOGRAM_ENC_H_
+#define WEBP_ENC_HISTOGRAM_ENC_H_
 
 #include <string.h>
 
-#include "./backward_references_enc.h"
-#include "../webp/format_constants.h"
-#include "../webp/types.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -90,7 +90,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits);
 
 // Accumulate a token 'v' into a histogram.
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v);
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0);
 
 static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
   return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
@@ -103,7 +105,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              int quality, int low_effort,
                              int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_in,
-                             VP8LHistogramSet* const tmp_histos,
+                             VP8LHistogram* const tmp_histo,
                              uint16_t* const histogram_symbols);
 
 // Returns the entropy for the symbols in the input array.
@@ -120,4 +122,4 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
 }
 #endif
 
-#endif  // WEBP_ENC_HISTOGRAM_H_
+#endif  // WEBP_ENC_HISTOGRAM_ENC_H_
diff --git a/thirdparty/libwebp/enc/iterator_enc.c b/thirdparty/libwebp/src/enc/iterator_enc.c
index e48d30bd31..cfacfd2401 100644
--- a/thirdparty/libwebp/enc/iterator_enc.c
+++ b/thirdparty/libwebp/src/enc/iterator_enc.c
@@ -13,7 +13,7 @@
 
 #include <string.h>
 
-#include "./vp8i_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // VP8Iterator
diff --git a/thirdparty/libwebp/enc/near_lossless_enc.c b/thirdparty/libwebp/src/enc/near_lossless_enc.c
index 2bd03ab20d..cadd14c664 100644
--- a/thirdparty/libwebp/enc/near_lossless_enc.c
+++ b/thirdparty/libwebp/src/enc/near_lossless_enc.c
@@ -17,18 +17,20 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "../dsp/lossless_common.h"
-#include "../utils/utils.h"
-#include "./vp8i_enc.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/utils.h"
+#include "src/enc/vp8li_enc.h"
+
+#if (WEBP_NEAR_LOSSLESS == 1)
 
 #define MIN_DIM_FOR_NEAR_LOSSLESS 64
 #define MAX_LIMIT_BITS             5
 
 // Quantizes the value up or down to a multiple of 1<<bits (or to 255),
 // choosing the closer one, resolving ties using bankers' rounding.
-static int FindClosestDiscretized(int a, int bits) {
-  const int mask = (1 << bits) - 1;
-  const int biased = a + (mask >> 1) + ((a >> bits) & 1);
+static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
+  const uint32_t mask = (1u << bits) - 1;
+  const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
   assert(bits > 0);
   if (biased > 0xff) return 0xff;
   return biased & ~mask;
@@ -69,22 +71,30 @@ static int IsSmooth(const uint32_t* const prev_row,
 }
 
 // Adjusts pixel values of image with given maximum error.
-static void NearLossless(int xsize, int ysize, uint32_t* argb,
-                         int limit_bits, uint32_t* copy_buffer) {
+static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
+                         int stride, int limit_bits, uint32_t* copy_buffer,
+                         uint32_t* argb_dst) {
   int x, y;
   const int limit = 1 << limit_bits;
   uint32_t* prev_row = copy_buffer;
   uint32_t* curr_row = prev_row + xsize;
   uint32_t* next_row = curr_row + xsize;
-  memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));
+  memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
+  memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
 
-  for (y = 1; y < ysize - 1; ++y) {
-    uint32_t* const curr_argb_row = argb + y * xsize;
-    uint32_t* const next_argb_row = curr_argb_row + xsize;
-    memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
-    for (x = 1; x < xsize - 1; ++x) {
-      if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
-        curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+  for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
+    if (y == 0 || y == ysize - 1) {
+      memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
+    } else {
+      memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+      argb_dst[0] = argb_src[0];
+      argb_dst[xsize - 1] = argb_src[xsize - 1];
+      for (x = 1; x < xsize - 1; ++x) {
+        if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+          argb_dst[x] = curr_row[x];
+        } else {
+          argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+        }
       }
     }
     {
@@ -97,26 +107,45 @@ static void NearLossless(int xsize, int ysize, uint32_t* argb,
   }
 }
 
-int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst) {
   int i;
+  const int xsize = picture->width;
+  const int ysize = picture->height;
+  const int stride = picture->argb_stride;
   uint32_t* const copy_buffer =
       (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
   const int limit_bits = VP8LNearLosslessBits(quality);
-  assert(argb != NULL);
-  assert(limit_bits >= 0);
+  assert(argb_dst != NULL);
+  assert(limit_bits > 0);
   assert(limit_bits <= MAX_LIMIT_BITS);
   if (copy_buffer == NULL) {
     return 0;
   }
   // For small icon images, don't attempt to apply near-lossless compression.
-  if (xsize < MIN_DIM_FOR_NEAR_LOSSLESS && ysize < MIN_DIM_FOR_NEAR_LOSSLESS) {
+  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
+      ysize < 3) {
+    for (i = 0; i < ysize; ++i) {
+      memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
+             xsize * sizeof(*argb_dst));
+    }
     WebPSafeFree(copy_buffer);
     return 1;
   }
 
-  for (i = limit_bits; i != 0; --i) {
-    NearLossless(xsize, ysize, argb, i, copy_buffer);
+  NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
+               argb_dst);
+  for (i = limit_bits - 1; i != 0; --i) {
+    NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
   }
   WebPSafeFree(copy_buffer);
   return 1;
 }
+#else  // (WEBP_NEAR_LOSSLESS == 1)
+
+// Define a stub to suppress compiler warnings.
+extern void VP8LNearLosslessStub(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8LNearLosslessStub(void) {}
+
+#endif  // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/thirdparty/libwebp/enc/picture_csp_enc.c b/thirdparty/libwebp/src/enc/picture_csp_enc.c
index e5d1c75a66..d531dd0282 100644
--- a/thirdparty/libwebp/enc/picture_csp_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,10 +15,12 @@
 #include <stdlib.h>
 #include <math.h>
 
-#include "./vp8i_enc.h"
-#include "../utils/random_utils.h"
-#include "../utils/utils.h"
-#include "../dsp/yuv.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/random_utils.h"
+#include "src/utils/utils.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/yuv.h"
 
 // Uncomment to disable gamma-compression during RGB->U/V averaging
 #define USE_GAMMA_COMPRESSION
@@ -39,12 +41,15 @@ static const union {
 static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
                           int x_step, int y_step) {
   if (alpha == NULL) return 0;
-  while (height-- > 0) {
-    int x;
-    for (x = 0; x < width * x_step; x += x_step) {
-      if (alpha[x] != 0xff) return 1;  // TODO(skal): check 4/8 bytes at a time.
+  WebPInitAlphaProcessing();
+  if (x_step == 1) {
+    for (; height-- > 0; alpha += y_step) {
+      if (WebPHasAlpha8b(alpha, width)) return 1;
+    }
+  } else {
+    for (; height-- > 0; alpha += y_step) {
+      if (WebPHasAlpha32b(alpha, width)) return 1;
     }
-    alpha += y_step;
   }
   return 0;
 }
@@ -56,15 +61,10 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
     return CheckNonOpaque(picture->a, picture->width, picture->height,
                           1, picture->a_stride);
   } else {
-    int x, y;
-    const uint32_t* argb = picture->argb;
-    if (argb == NULL) return 0;
-    for (y = 0; y < picture->height; ++y) {
-      for (x = 0; x < picture->width; ++x) {
-        if (argb[x] < 0xff000000u) return 1;   // test any alpha values != 0xff
-      }
-      argb += picture->argb_stride;
-    }
+    const int alpha_offset = ALPHA_IS_LAST ? 3 : 0;
+    return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
+                          picture->width, picture->height,
+                          4, picture->argb_stride * sizeof(*picture->argb));
   }
   return 0;
 }
@@ -171,7 +171,7 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 #if defined(USE_GAMMA_COMPRESSION)
 
 // float variant of gamma-correction
-// We use tables of different size and precision for the Rec709
+// We use tables of different size and precision for the Rec709 / BT2020
 // transfer function.
 #define kGammaF (1./0.45)
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
@@ -183,8 +183,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
     int v;
     const double norm = 1. / MAX_Y_T;
     const double scale = 1. / kGammaTabSize;
-    const double a = 0.099;
-    const double thresh = 0.018;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
     for (v = 0; v <= MAX_Y_T; ++v) {
       const double g = norm * v;
       if (g <= thresh * 4.5) {
@@ -856,7 +856,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
     return 0;
   }
   if (has_alpha) {
-    WebPInitAlphaProcessing();
     assert(step == 4);
 #if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
     assert(kAlphaFix + kGammaFix <= 31);
@@ -1085,40 +1084,45 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
 // automatic import / conversion
 
 static int Import(WebPPicture* const picture,
-                  const uint8_t* const rgb, int rgb_stride,
+                  const uint8_t* rgb, int rgb_stride,
                   int step, int swap_rb, int import_alpha) {
   int y;
   const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
   const uint8_t* g_ptr = rgb + 1;
   const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
-  const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
   const int width = picture->width;
   const int height = picture->height;
 
   if (!picture->use_argb) {
+    const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
                               0.f /* no dithering */, 0, picture);
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
-  VP8EncDspARGBInit();
+  VP8LDspInit();
+  WebPInitAlphaProcessing();
 
   if (import_alpha) {
     uint32_t* dst = picture->argb;
+    const int do_copy =
+        (!swap_rb && !ALPHA_IS_LAST) || (swap_rb && ALPHA_IS_LAST);
     assert(step == 4);
     for (y = 0; y < height; ++y) {
-      VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
-      a_ptr += rgb_stride;
-      r_ptr += rgb_stride;
-      g_ptr += rgb_stride;
-      b_ptr += rgb_stride;
+      if (do_copy) {
+        memcpy(dst, rgb, width * 4);
+      } else {
+        // RGBA input order. Need to swap R and B.
+        VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+      }
+      rgb += rgb_stride;
       dst += picture->argb_stride;
     }
   } else {
     uint32_t* dst = picture->argb;
     assert(step >= 3);
     for (y = 0; y < height; ++y) {
-      VP8PackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+      WebPPackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
       r_ptr += rgb_stride;
       g_ptr += rgb_stride;
       b_ptr += rgb_stride;
@@ -1130,12 +1134,7 @@ static int Import(WebPPicture* const picture,
 
 // Public API
 
-int WebPPictureImportRGB(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return (picture != NULL && rgb != NULL)
-             ? Import(picture, rgb, rgb_stride, 3, 0, 0)
-             : 0;
-}
+#if !defined(WEBP_REDUCE_CSP)
 
 int WebPPictureImportBGR(WebPPicture* picture,
                          const uint8_t* rgb, int rgb_stride) {
@@ -1144,31 +1143,41 @@ int WebPPictureImportBGR(WebPPicture* picture,
              : 0;
 }
 
-int WebPPictureImportRGBA(WebPPicture* picture,
+int WebPPictureImportBGRA(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 1)
+             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
              : 0;
 }
 
-int WebPPictureImportBGRA(WebPPicture* picture,
+
+int WebPPictureImportBGRX(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
              : 0;
 }
 
-int WebPPictureImportRGBX(WebPPicture* picture,
+#endif   // WEBP_REDUCE_CSP
+
+int WebPPictureImportRGB(WebPPicture* picture,
+                         const uint8_t* rgb, int rgb_stride) {
+  return (picture != NULL && rgb != NULL)
+             ? Import(picture, rgb, rgb_stride, 3, 0, 0)
+             : 0;
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+             ? Import(picture, rgba, rgba_stride, 4, 0, 1)
              : 0;
 }
 
-int WebPPictureImportBGRX(WebPPicture* picture,
+int WebPPictureImportRGBX(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
              : 0;
 }
 
diff --git a/thirdparty/libwebp/enc/picture_enc.c b/thirdparty/libwebp/src/enc/picture_enc.c
index dfa66510fb..c691622d03 100644
--- a/thirdparty/libwebp/enc/picture_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_enc.c
@@ -14,9 +14,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // WebPPicture
@@ -76,13 +76,12 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
   // allocate a new buffer.
-  memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
+  memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
   if (memory == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  // TODO(skal): align plane to cache line?
   picture->memory_argb_ = memory;
-  picture->argb = (uint32_t*)memory;
+  picture->argb = (uint32_t*)WEBP_ALIGN(memory);
   picture->argb_stride = width;
   return 1;
 }
@@ -92,8 +91,8 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
       (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
   const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
   const int y_stride = width;
-  const int uv_width = (width + 1) >> 1;
-  const int uv_height = (height + 1) >> 1;
+  const int uv_width = (int)(((int64_t)width + 1) >> 1);
+  const int uv_height = (int)(((int64_t)height + 1) >> 1);
   const int uv_stride = uv_width;
   int a_width, a_stride;
   uint64_t y_size, uv_size, a_size, total_size;
@@ -118,8 +117,8 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
   total_size = y_size + a_size + 2 * uv_size;
 
   // Security and validation checks
-  if (width <= 0 || height <= 0 ||         // luma/alpha param error
-      uv_width < 0 || uv_height < 0) {     // u/v param error
+  if (width <= 0 || height <= 0 ||           // luma/alpha param error
+      uv_width <= 0 || uv_height <= 0) {     // u/v param error
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
   // allocate a new buffer.
@@ -271,9 +270,11 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
 }
 
 ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
 ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
 ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+#endif  // WEBP_REDUCE_CSP
 
 #undef ENCODE_FUNC
 
@@ -284,9 +285,11 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
 }
 
 LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
 LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
 LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+#endif  // WEBP_REDUCE_CSP
 
 #undef LOSSLESS_ENCODE_FUNC
 
diff --git a/thirdparty/libwebp/enc/picture_psnr_enc.c b/thirdparty/libwebp/src/enc/picture_psnr_enc.c
index 9c0b229507..362a7c79be 100644
--- a/thirdparty/libwebp/enc/picture_psnr_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -11,11 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include "src/webp/encode.h"
+
+#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
+
 #include <math.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
 
 typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
                                  const uint8_t* ref, int ref_stride,
@@ -210,4 +214,34 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
   return ok;
 }
 
-//------------------------------------------------------------------------------
+#else  // defined(WEBP_DISABLE_STATS)
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                        const uint8_t* ref, size_t ref_stride,
+                        int width, int height, size_t x_step,
+                        int type, float* distortion, float* result) {
+  (void)src;
+  (void)src_stride;
+  (void)ref;
+  (void)ref_stride;
+  (void)width;
+  (void)height;
+  (void)x_step;
+  (void)type;
+  if (distortion == NULL || result == NULL) return 0;
+  *distortion = 0.f;
+  *result = 0.f;
+  return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float results[5]) {
+  int i;
+  (void)src;
+  (void)ref;
+  (void)type;
+  if (results == NULL) return 0;
+  for (i = 0; i < 5; ++i) results[i] = 0.f;
+  return 1;
+}
+
+#endif  // !defined(WEBP_DISABLE_STATS)
diff --git a/thirdparty/libwebp/enc/picture_rescale_enc.c b/thirdparty/libwebp/src/enc/picture_rescale_enc.c
index 0b7181c0d7..58a6ae7b9d 100644
--- a/thirdparty/libwebp/enc/picture_rescale_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -11,12 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include "src/webp/encode.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../utils/rescaler_utils.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
 
 #define HALVE(x) (((x) + 1) >> 1)
 
@@ -261,4 +265,45 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   return 1;
 }
 
-//------------------------------------------------------------------------------
+#else  // defined(WEBP_REDUCE_SIZE)
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+  (void)src;
+  (void)dst;
+  return 0;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+  (void)picture;
+  return 0;
+}
+
+int WebPPictureView(const WebPPicture* src,
+                    int left, int top, int width, int height,
+                    WebPPicture* dst) {
+  (void)src;
+  (void)left;
+  (void)top;
+  (void)width;
+  (void)height;
+  (void)dst;
+  return 0;
+}
+
+int WebPPictureCrop(WebPPicture* pic,
+                    int left, int top, int width, int height) {
+  (void)pic;
+  (void)left;
+  (void)top;
+  (void)width;
+  (void)height;
+  return 0;
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+  (void)pic;
+  (void)width;
+  (void)height;
+  return 0;
+}
+#endif  // !defined(WEBP_REDUCE_SIZE)
diff --git a/thirdparty/libwebp/enc/picture_tools_enc.c b/thirdparty/libwebp/src/enc/picture_tools_enc.c
index 895df51156..be292d4391 100644
--- a/thirdparty/libwebp/enc/picture_tools_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_tools_enc.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 
-#include "./vp8i_enc.h"
-#include "../dsp/yuv.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/yuv.h"
 
 static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
   return (0xff000000u | (r << 16) | (g << 8) | b);
@@ -25,20 +25,7 @@ static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
 
 #define SIZE 8
 #define SIZE2 (SIZE / 2)
-static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
-  int y, x;
-  for (y = 0; y < size; ++y) {
-    for (x = 0; x < size; ++x) {
-      if (ptr[x]) {
-        return 0;
-      }
-    }
-    ptr += stride;
-  }
-  return 1;
-}
-
-static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
+static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) {
   int y, x;
   for (y = 0; y < size; ++y) {
     for (x = 0; x < size; ++x) {
@@ -51,7 +38,7 @@ static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
   return 1;
 }
 
-static void flatten(uint8_t* ptr, int v, int stride, int size) {
+static void Flatten(uint8_t* ptr, int v, int stride, int size) {
   int y;
   for (y = 0; y < size; ++y) {
     memset(ptr, v, size);
@@ -59,7 +46,7 @@ static void flatten(uint8_t* ptr, int v, int stride, int size) {
   }
 }
 
-static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
+static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) {
   int x, y;
   for (y = 0; y < size; ++y) {
     for (x = 0; x < size; ++x) ptr[x] = v;
@@ -67,54 +54,114 @@ static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
   }
 }
 
+// Smoothen the luma components of transparent pixels. Return true if the whole
+// block is transparent.
+static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
+                         int y_stride, int width, int height) {
+  int sum = 0, count = 0;
+  int x, y;
+  const uint8_t* alpha_ptr = a_ptr;
+  uint8_t* luma_ptr = y_ptr;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      if (alpha_ptr[x] != 0) {
+        ++count;
+        sum += luma_ptr[x];
+      }
+    }
+    alpha_ptr += a_stride;
+    luma_ptr += y_stride;
+  }
+  if (count > 0 && count < width * height) {
+    const uint8_t avg_u8 = (uint8_t)(sum / count);
+    alpha_ptr = a_ptr;
+    luma_ptr = y_ptr;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8;
+      }
+      alpha_ptr += a_stride;
+      luma_ptr += y_stride;
+    }
+  }
+  return (count == 0);
+}
+
 void WebPCleanupTransparentArea(WebPPicture* pic) {
   int x, y, w, h;
   if (pic == NULL) return;
   w = pic->width / SIZE;
   h = pic->height / SIZE;
 
-  // note: we ignore the left-overs on right/bottom
+  // note: we ignore the left-overs on right/bottom, except for SmoothenBlock().
   if (pic->use_argb) {
     uint32_t argb_value = 0;
     for (y = 0; y < h; ++y) {
       int need_reset = 1;
       for (x = 0; x < w; ++x) {
         const int off = (y * pic->argb_stride + x) * SIZE;
-        if (is_transparent_argb_area(pic->argb + off, pic->argb_stride, SIZE)) {
+        if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) {
           if (need_reset) {
             argb_value = pic->argb[off];
             need_reset = 0;
           }
-          flatten_argb(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+          FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE);
         } else {
           need_reset = 1;
         }
       }
     }
   } else {
-    const uint8_t* const a_ptr = pic->a;
+    const int width = pic->width;
+    const int height = pic->height;
+    const int y_stride = pic->y_stride;
+    const int uv_stride = pic->uv_stride;
+    const int a_stride = pic->a_stride;
+    uint8_t* y_ptr = pic->y;
+    uint8_t* u_ptr = pic->u;
+    uint8_t* v_ptr = pic->v;
+    const uint8_t* a_ptr = pic->a;
     int values[3] = { 0 };
-    if (a_ptr == NULL) return;    // nothing to do
-    for (y = 0; y < h; ++y) {
+    if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) {
+      return;
+    }
+    for (y = 0; y + SIZE <= height; y += SIZE) {
       int need_reset = 1;
-      for (x = 0; x < w; ++x) {
-        const int off_a = (y * pic->a_stride + x) * SIZE;
-        const int off_y = (y * pic->y_stride + x) * SIZE;
-        const int off_uv = (y * pic->uv_stride + x) * SIZE2;
-        if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
+      for (x = 0; x + SIZE <= width; x += SIZE) {
+        if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                          SIZE, SIZE)) {
           if (need_reset) {
-            values[0] = pic->y[off_y];
-            values[1] = pic->u[off_uv];
-            values[2] = pic->v[off_uv];
+            values[0] = y_ptr[x];
+            values[1] = u_ptr[x >> 1];
+            values[2] = v_ptr[x >> 1];
             need_reset = 0;
           }
-          flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
-          flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
-          flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
+          Flatten(y_ptr + x,        values[0], y_stride,  SIZE);
+          Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2);
+          Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2);
         } else {
           need_reset = 1;
         }
       }
+      if (x < width) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      width - x, SIZE);
+      }
+      a_ptr += SIZE * a_stride;
+      y_ptr += SIZE * y_stride;
+      u_ptr += SIZE2 * uv_stride;
+      v_ptr += SIZE2 * uv_stride;
+    }
+    if (y < height) {
+      const int sub_height = height - y;
+      for (x = 0; x + SIZE <= width; x += SIZE) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      SIZE, sub_height);
+      }
+      if (x < width) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      width - x, sub_height);
+      }
     }
   }
 }
@@ -144,9 +191,9 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) {
 // Blend color and remove transparency info
 
 #define BLEND(V0, V1, ALPHA) \
-    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
+    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
 #define BLEND_10BIT(V0, V1, ALPHA) \
-    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
+    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
 
 void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
   const int red = (background_rgb >> 16) & 0xff;
diff --git a/thirdparty/libwebp/enc/predictor_enc.c b/thirdparty/libwebp/src/enc/predictor_enc.c
index 0639b74f1c..f3715f515e 100644
--- a/thirdparty/libwebp/enc/predictor_enc.c
+++ b/thirdparty/libwebp/src/enc/predictor_enc.c
@@ -14,9 +14,9 @@
 //          Urvang Joshi (urvang@google.com)
 //          Vincent Rabaud (vrabaud@google.com)
 
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "./vp8li_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/enc/vp8li_enc.h"
 
 #define MAX_DIFF_COST (1e30f)
 
@@ -26,7 +26,6 @@ static const uint32_t kMaskAlpha = 0xff000000;
 
 // Mostly used to reduce code size + readability
 static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
-static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
 
 //------------------------------------------------------------------------------
 // Methods to calculate Entropy (Shannon).
@@ -90,6 +89,9 @@ static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
   }
 }
 
+#if (WEBP_NEAR_LOSSLESS == 1)
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
 static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
   const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
   const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
@@ -180,6 +182,7 @@ static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
 // max_quantization which is a power of 2, smaller than max_diff). Take care if
 // value and predict have undergone subtract green, which means that red and
 // blue are represented as offsets from green.
+#define NEAR_LOSSLESS_DIFF(a, b) (uint8_t)((((int)(a) - (int)(b))) & 0xff)
 static uint32_t NearLossless(uint32_t value, uint32_t predict,
                              int max_quantization, int max_diff,
                              int used_subtract_green) {
@@ -196,7 +199,7 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
   }
   if ((value >> 24) == 0 || (value >> 24) == 0xff) {
     // Preserve transparency of fully transparent or fully opaque pixels.
-    a = ((value >> 24) - (predict >> 24)) & 0xff;
+    a = NEAR_LOSSLESS_DIFF(value >> 24, predict >> 24);
   } else {
     a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
   }
@@ -209,15 +212,17 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
     // The amount by which green has been adjusted during quantization. It is
     // subtracted from red and blue for compensation, to avoid accumulating two
     // quantization errors in them.
-    green_diff = (new_green - (value >> 8)) & 0xff;
+    green_diff = NEAR_LOSSLESS_DIFF(new_green, value >> 8);
   }
-  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
+  r = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value >> 16, green_diff),
                             (predict >> 16) & 0xff, 0xff - new_green,
                             quantization);
-  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
-                            0xff - new_green, quantization);
+  b = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value, green_diff),
+                            predict & 0xff, 0xff - new_green, quantization);
   return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
 }
+#undef NEAR_LOSSLESS_DIFF
+#endif  // (WEBP_NEAR_LOSSLESS == 1)
 
 // Stores the difference between the pixel and its prediction in "out".
 // In case of a lossy encoding, updates the source image to avoid propagating
@@ -244,6 +249,7 @@ static WEBP_INLINE void GetResidual(
       } else {
         predict = pred_func(current_row[x - 1], upper_row + x);
       }
+#if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
           x == 0 || x == width - 1) {
         residual = VP8LSubPixels(current_row[x], predict);
@@ -254,6 +260,13 @@ static WEBP_INLINE void GetResidual(
         current_row[x] = VP8LAddPixels(predict, residual);
         // x is never 0 here so we do not need to update upper_row like below.
       }
+#else
+      (void)max_diffs;
+      (void)height;
+      (void)max_quantization;
+      (void)used_subtract_green;
+      residual = VP8LSubPixels(current_row[x], predict);
+#endif
       if ((current_row[x] & kMaskAlpha) == 0) {
         // If alpha is 0, cleanup RGB. We can choose the RGB values of the
         // residual for best compression. The prediction of alpha itself can be
@@ -296,11 +309,12 @@ static int GetBestPredictorForTile(int width, int height,
   const int max_x = GetMin(tile_size, width - start_x);
   // Whether there exist columns just outside the tile.
   const int have_left = (start_x > 0);
-  const int have_right = (max_x < width - start_x);
   // Position and size of the strip covering the tile and adjacent columns if
   // they exist.
   const int context_start_x = start_x - have_left;
-  const int context_width = max_x + have_left + have_right;
+#if (WEBP_NEAR_LOSSLESS == 1)
+  const int context_width = max_x + have_left + (max_x < width - start_x);
+#endif
   const int tiles_per_row = VP8LSubSampleSize(width, bits);
   // Prediction modes of the left and above neighbor tiles.
   const int left_mode = (tile_x > 0) ?
@@ -352,10 +366,12 @@ static int GetBestPredictorForTile(int width, int height,
       memcpy(current_row + context_start_x,
              argb + y * width + context_start_x,
              sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+#if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization > 1 && y >= 1 && y + 1 < height) {
         MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
                        max_diffs + context_start_x, used_subtract_green);
       }
+#endif
 
       GetResidual(width, height, upper_row, current_row, max_diffs, mode,
                   start_x, start_x + max_x, y, max_quantization, exact,
@@ -405,7 +421,9 @@ static void CopyImageWithPrediction(int width, int height,
   uint32_t* upper_row = argb_scratch;
   uint32_t* current_row = upper_row + width + 1;
   uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+#if (WEBP_NEAR_LOSSLESS == 1)
   uint8_t* lower_max_diffs = current_max_diffs + width;
+#endif
   int y;
 
   for (y = 0; y < height; ++y) {
@@ -420,6 +438,7 @@ static void CopyImageWithPrediction(int width, int height,
       PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
                    argb + y * width);
     } else {
+#if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization > 1) {
         // Compute max_diffs for the lower row now, because that needs the
         // contents of argb for the current row, which we will overwrite with
@@ -432,6 +451,7 @@ static void CopyImageWithPrediction(int width, int height,
                          used_subtract_green);
         }
       }
+#endif
       for (x = 0; x < width;) {
         const int mode =
             (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
diff --git a/thirdparty/libwebp/enc/quant_enc.c b/thirdparty/libwebp/src/enc/quant_enc.c
index b118fb2a13..3b1a3129b5 100644
--- a/thirdparty/libwebp/enc/quant_enc.c
+++ b/thirdparty/libwebp/src/enc/quant_enc.c
@@ -15,8 +15,8 @@
 #include <math.h>
 #include <stdlib.h>  // for abs()
 
-#include "./vp8i_enc.h"
-#include "./cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
 
 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@@ -457,11 +457,11 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
 // Form the predictions in cache
 
 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
-const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
-const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
+const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
+const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
 
 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
-const int VP8I4ModeOffsets[NUM_BMODES] = {
+const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
 };
 
@@ -492,14 +492,14 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // |YYYY|....| 12
 // +----+----+
 
-const int VP8Scan[16] = {  // Luma
+const uint16_t VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
 };
 
-static const int VP8ScanUV[4 + 4] = {
+static const uint16_t VP8ScanUV[4 + 4] = {
   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
@@ -1162,7 +1162,7 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
+      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                           + VP8FixedCostsI16[mode] * lambda_d_i16;
       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
         continue;
diff --git a/thirdparty/libwebp/enc/syntax_enc.c b/thirdparty/libwebp/src/enc/syntax_enc.c
index 90665bd7e5..a9e5a6cf0f 100644
--- a/thirdparty/libwebp/enc/syntax_enc.c
+++ b/thirdparty/libwebp/src/enc/syntax_enc.c
@@ -13,10 +13,10 @@
 
 #include <assert.h>
 
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"  // RIFF constants
-#include "../webp/mux_types.h"         // ALPHA_FLAG
-#include "./vp8i_enc.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"  // RIFF constants
+#include "src/webp/mux_types.h"         // ALPHA_FLAG
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Helper functions
@@ -289,11 +289,17 @@ static int GeneratePartition0(VP8Encoder* const enc) {
 
   pos3 = VP8BitWriterPos(bw);
 
+#if !defined(WEBP_DISABLE_STATS)
   if (enc->pic_->stats) {
     enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
     enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
     enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
   }
+#else
+  (void)pos1;
+  (void)pos2;
+  (void)pos3;
+#endif
   if (bw->error_) {
     return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
diff --git a/thirdparty/libwebp/enc/token_enc.c b/thirdparty/libwebp/src/enc/token_enc.c
index 02a0d72cc6..3a2192acac 100644
--- a/thirdparty/libwebp/enc/token_enc.c
+++ b/thirdparty/libwebp/src/enc/token_enc.c
@@ -20,9 +20,9 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./cost_enc.h"
-#include "./vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
 
 #if !defined(DISABLE_TOKEN_BUFFER)
 
@@ -195,39 +195,6 @@ int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
 #undef TOKEN_ID
 
 //------------------------------------------------------------------------------
-// This function works, but isn't currently used. Saved for later.
-
-#if 0
-
-static void Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-}
-
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
-  const VP8Tokens* p = b->pages_;
-  while (p != NULL) {
-    const int N = (p->next_ == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    const token_t* const tokens = TOKEN_DATA(p);
-    while (n-- > N) {
-      const token_t token = tokens[n];
-      if (!(token & FIXED_PROBA_BIT)) {
-        Record((token >> 15) & 1, stats + (token & 0x3fffu));
-      }
-    }
-    p = p->next_;
-  }
-}
-
-#endif   // 0
-
-//------------------------------------------------------------------------------
 // Final coding pass, with known probabilities
 
 int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
@@ -283,8 +250,9 @@ size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
 
 #else     // DISABLE_TOKEN_BUFFER
 
-void VP8TBufferInit(VP8TBuffer* const b) {
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
   (void)b;
+  (void)page_size;
 }
 void VP8TBufferClear(VP8TBuffer* const b) {
   (void)b;
diff --git a/thirdparty/libwebp/enc/tree_enc.c b/thirdparty/libwebp/src/enc/tree_enc.c
index 2c40fe7f3d..64ed28360b 100644
--- a/thirdparty/libwebp/enc/tree_enc.c
+++ b/thirdparty/libwebp/src/enc/tree_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Default probabilities
diff --git a/thirdparty/libwebp/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h
index 93c95ecbfb..3463491e9d 100644
--- a/thirdparty/libwebp/enc/vp8i_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8i_enc.h
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_ENC_VP8ENCI_H_
-#define WEBP_ENC_VP8ENCI_H_
+#ifndef WEBP_ENC_VP8I_ENC_H_
+#define WEBP_ENC_VP8I_ENC_H_
 
 #include <string.h>     // for memcpy()
-#include "../dec/common_dec.h"
-#include "../dsp/dsp.h"
-#include "../utils/bit_writer_utils.h"
-#include "../utils/thread_utils.h"
-#include "../utils/utils.h"
-#include "../webp/encode.h"
+#include "src/dec/common_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/encode.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 6
-#define ENC_REV_VERSION 0
+#define ENC_REV_VERSION 1
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -75,10 +75,10 @@ typedef enum {   // Rate-distortion optimization levels
 #define U_OFF_ENC    (16)
 #define V_OFF_ENC    (16 + 8)
 
-extern const int VP8Scan[16];           // in quant.c
-extern const int VP8UVModeOffsets[4];   // in analyze.c
-extern const int VP8I16ModeOffsets[4];
-extern const int VP8I4ModeOffsets[NUM_BMODES];
+extern const uint16_t VP8Scan[16];
+extern const uint16_t VP8UVModeOffsets[4];
+extern const uint16_t VP8I16ModeOffsets[4];
+extern const uint16_t VP8I4ModeOffsets[NUM_BMODES];
 
 // Layout of prediction blocks
 // intra 16x16
@@ -330,9 +330,6 @@ int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
 // Estimate the final coded size given a set of 'probas'.
 size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
 
-// unused for now
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);
-
 #endif  // !DISABLE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
@@ -502,19 +499,10 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
 // compressibility (no guarantee, though). Assumes that pic->use_argb is true.
 void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
 
-  // in near_lossless.c
-// Near lossless preprocessing in RGB color-space.
-int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality);
-// Near lossless adjustment for predictors.
-void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits,
-                                 const uint32_t* argb_orig,
-                                 uint32_t* argb, uint32_t* argb_scratch,
-                                 const uint32_t* const transform_data,
-                                 int quality, int subtract_green);
 //------------------------------------------------------------------------------
 
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_ENC_VP8ENCI_H_ */
+#endif  /* WEBP_ENC_VP8I_ENC_H_ */
diff --git a/thirdparty/libwebp/enc/vp8l_enc.c b/thirdparty/libwebp/src/enc/vp8l_enc.c
index b1a793d956..312e521906 100644
--- a/thirdparty/libwebp/enc/vp8l_enc.c
+++ b/thirdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,20 +15,19 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "./vp8i_enc.h"
-#include "./vp8li_enc.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../utils/bit_writer_utils.h"
-#include "../utils/huffman_encode_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
-
-#include "./delta_palettization_enc.h"
-
-#define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/vp8li_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/utils/huffman_encode_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
+
+#include "src/enc/delta_palettization_enc.h"
+
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600
 
@@ -128,7 +127,10 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
                                    uint32_t palette[MAX_PALETTE_SIZE],
                                    int* const palette_size) {
   const int num_colors = WebPGetColorPalette(pic, palette);
-  if (num_colors > MAX_PALETTE_SIZE) return 0;
+  if (num_colors > MAX_PALETTE_SIZE) {
+    *palette_size = 0;
+    return 0;
+  }
   *palette_size = num_colors;
   qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
   if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
@@ -188,22 +190,33 @@ static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
 static int AnalyzeEntropy(const uint32_t* argb,
                           int width, int height, int argb_stride,
                           int use_palette,
+                          int palette_size, int transform_bits,
                           EntropyIx* const min_entropy_ix,
                           int* const red_and_blue_always_zero) {
   // Allocate histogram set with cache_bits = 0.
-  uint32_t* const histo =
-      (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+  uint32_t* histo;
+
+  if (use_palette && palette_size <= 16) {
+    // In the case of small palettes, we pack 2, 4 or 8 pixels together. In
+    // practice, small palettes are better than any other transform.
+    *min_entropy_ix = kPalette;
+    *red_and_blue_always_zero = 1;
+    return 1;
+  }
+  histo = (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
   if (histo != NULL) {
     int i, x, y;
-    const uint32_t* prev_row = argb;
-    const uint32_t* curr_row = argb + argb_stride;
-    for (y = 1; y < height; ++y) {
-      uint32_t prev_pix = curr_row[0];
-      for (x = 1; x < width; ++x) {
+    const uint32_t* prev_row = NULL;
+    const uint32_t* curr_row = argb;
+    uint32_t pix_prev = argb[0];  // Skip the first pixel.
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
         const uint32_t pix = curr_row[x];
-        const uint32_t pix_diff = VP8LSubPixels(pix, prev_pix);
-        if ((pix_diff == 0) || (pix == prev_row[x])) continue;
-        prev_pix = pix;
+        const uint32_t pix_diff = VP8LSubPixels(pix, pix_prev);
+        pix_prev = pix;
+        if ((pix_diff == 0) || (prev_row != NULL && pix == prev_row[x])) {
+          continue;
+        }
         AddSingle(pix,
                   &histo[kHistoAlpha * 256],
                   &histo[kHistoRed * 256],
@@ -264,8 +277,24 @@ static int AnalyzeEntropy(const uint32_t* argb,
           entropy_comp[kHistoRedPredSubGreen] +
           entropy_comp[kHistoGreenPred] +
           entropy_comp[kHistoBluePredSubGreen];
-      // Palette mode seems more efficient in a breakeven case. Bias with 1.0.
-      entropy[kPalette] = entropy_comp[kHistoPalette] - 1.0;
+      entropy[kPalette] = entropy_comp[kHistoPalette];
+
+      // When including transforms, there is an overhead in bits from
+      // storing them. This overhead is small but matters for small images.
+      // For spatial, there are 14 transformations.
+      entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+                           VP8LSubSampleSize(height, transform_bits) *
+                           VP8LFastLog2(14);
+      // For color transforms: 24 as only 3 channels are considered in a
+      // ColorTransformElement.
+      entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
+                                   VP8LSubSampleSize(height, transform_bits) *
+                                   VP8LFastLog2(24);
+      // For palettes, add the cost of storing the palette.
+      // We empirically estimate the cost of a compressed entry as 8 bits.
+      // The palette is differential-coded when compressed hence a much
+      // lower cost than sizeof(uint32_t)*8.
+      entropy[kPalette] += palette_size * 8;
 
       *min_entropy_ix = kDirect;
       for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
@@ -273,6 +302,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
           *min_entropy_ix = (EntropyIx)k;
         }
       }
+      assert((int)*min_entropy_ix <= last_mode_to_analyze);
       *red_and_blue_always_zero = 1;
       // Let's check if the histogram of the chosen entropy mode has
       // non-zero red and blue values. If all are zero, we can later skip
@@ -325,60 +355,95 @@ static int GetTransformBits(int method, int histo_bits) {
   return res;
 }
 
-static int AnalyzeAndInit(VP8LEncoder* const enc) {
+// Set of parameters to be used in each iteration of the cruncher.
+#define CRUNCH_CONFIGS_LZ77_MAX 2
+typedef struct {
+  int entropy_idx_;
+  int lz77s_types_to_try_[CRUNCH_CONFIGS_LZ77_MAX];
+  int lz77s_types_to_try_size_;
+} CrunchConfig;
+
+#define CRUNCH_CONFIGS_MAX kNumEntropyIx
+
+static int EncoderAnalyze(VP8LEncoder* const enc,
+                          CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
+                          int* const crunch_configs_size,
+                          int* const red_and_blue_always_zero) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
-  const int pix_cnt = width * height;
   const WebPConfig* const config = enc->config_;
   const int method = config->method;
   const int low_effort = (config->method == 0);
-  // we round the block size up, so we're guaranteed to have
-  // at max MAX_REFS_BLOCK_PER_IMAGE blocks used:
-  int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+  int i;
+  int use_palette;
+  int n_lz77s;
   assert(pic != NULL && pic->argb != NULL);
 
-  enc->use_cross_color_ = 0;
-  enc->use_predict_ = 0;
-  enc->use_subtract_green_ = 0;
-  enc->use_palette_ =
+  use_palette =
       AnalyzeAndCreatePalette(pic, low_effort,
                               enc->palette_, &enc->palette_size_);
 
   // TODO(jyrki): replace the decision to be based on an actual estimate
   // of entropy, or even spatial variance of entropy.
-  enc->histo_bits_ = GetHistoBits(method, enc->use_palette_,
+  enc->histo_bits_ = GetHistoBits(method, use_palette,
                                   pic->width, pic->height);
   enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
 
   if (low_effort) {
     // AnalyzeEntropy is somewhat slow.
-    enc->use_predict_ = !enc->use_palette_;
-    enc->use_subtract_green_ = !enc->use_palette_;
-    enc->use_cross_color_ = 0;
+    crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+    n_lz77s = 1;
+    *crunch_configs_size = 1;
   } else {
-    int red_and_blue_always_zero;
     EntropyIx min_entropy_ix;
-    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
-                        enc->use_palette_, &min_entropy_ix,
-                        &red_and_blue_always_zero)) {
+    // Try out multiple LZ77 on images with few colors.
+    n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
+    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
+                        enc->palette_size_, enc->transform_bits_,
+                        &min_entropy_ix, red_and_blue_always_zero)) {
       return 0;
     }
-    enc->use_palette_ = (min_entropy_ix == kPalette);
-    enc->use_subtract_green_ =
-        (min_entropy_ix == kSubGreen) || (min_entropy_ix == kSpatialSubGreen);
-    enc->use_predict_ =
-        (min_entropy_ix == kSpatial) || (min_entropy_ix == kSpatialSubGreen);
-    enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
+    if (method == 6 && config->quality == 100) {
+      // Go brute force on all transforms.
+      *crunch_configs_size = 0;
+      for (i = 0; i < kNumEntropyIx; ++i) {
+        if (i != kPalette || use_palette) {
+          assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
+          crunch_configs[(*crunch_configs_size)++].entropy_idx_ = i;
+        }
+      }
+    } else {
+      // Only choose the guessed best transform.
+      *crunch_configs_size = 1;
+      crunch_configs[0].entropy_idx_ = min_entropy_ix;
+    }
+  }
+  // Fill in the different LZ77s.
+  assert(n_lz77s <= CRUNCH_CONFIGS_LZ77_MAX);
+  for (i = 0; i < *crunch_configs_size; ++i) {
+    int j;
+    for (j = 0; j < n_lz77s; ++j) {
+      crunch_configs[i].lz77s_types_to_try_[j] =
+          (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box;
+    }
+    crunch_configs[i].lz77s_types_to_try_size_ = n_lz77s;
   }
+  return 1;
+}
 
+static int EncoderInit(VP8LEncoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+  const int pix_cnt = width * height;
+  // we round the block size up, so we're guaranteed to have
+  // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+  const int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+  int i;
   if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
 
-  // palette-friendly input typically uses less literals
-  //  -> reduce block size a bit
-  if (enc->use_palette_) refs_block_size /= 2;
-  VP8LBackwardRefsInit(&enc->refs_[0], refs_block_size);
-  VP8LBackwardRefsInit(&enc->refs_[1], refs_block_size);
+  for (i = 0; i < 3; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
 
   return 1;
 }
@@ -571,11 +636,16 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
     length = write_trimmed_length ? trimmed_length : num_tokens;
     VP8LPutBits(bw, write_trimmed_length, 1);
     if (write_trimmed_length) {
-      const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1);
-      const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
-      VP8LPutBits(bw, nbitpairs - 1, 3);
-      assert(trimmed_length >= 2);
-      VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+      if (trimmed_length == 2) {
+        VP8LPutBits(bw, 0, 3 + 2);     // nbitpairs=1, trimmed_length=2
+      } else {
+        const int nbits = BitsLog2Floor(trimmed_length - 2);
+        const int nbitpairs = nbits / 2 + 1;
+        assert(trimmed_length > 2);
+        assert(nbitpairs - 1 < 8);
+        VP8LPutBits(bw, nbitpairs - 1, 3);
+        VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+      }
     }
     StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
   }
@@ -642,7 +712,7 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
 
 static WebPEncodingError StoreImageToBitMask(
     VP8LBitWriter* const bw, int width, int histo_bits,
-    VP8LBackwardRefs* const refs,
+    const VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
     const HuffmanTreeCode* const huffman_codes) {
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
@@ -665,7 +735,7 @@ static WebPEncodingError StoreImageToBitMask(
       codes = huffman_codes + 5 * histogram_ix;
     }
     if (PixOrCopyIsLiteral(v)) {
-      static const int order[] = { 1, 2, 0, 3 };
+      static const uint8_t order[] = { 1, 2, 0, 3 };
       int k;
       for (k = 0; k < 4; ++k) {
         const int code = PixOrCopyLiteral(v, order[k]);
@@ -705,7 +775,8 @@ static WebPEncodingError StoreImageToBitMask(
 static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
                                               const uint32_t* const argb,
                                               VP8LHashChain* const hash_chain,
-                                              VP8LBackwardRefs refs_array[2],
+                                              VP8LBackwardRefs* const refs_tmp1,
+                                              VP8LBackwardRefs* const refs_tmp2,
                                               int width, int height,
                                               int quality, int low_effort) {
   int i;
@@ -730,8 +801,9 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
-  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, &cache_bits,
-                                   hash_chain, refs_array);
+  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0,
+                                   kLZ77Standard | kLZ77RLE, &cache_bits,
+                                   hash_chain, refs_tmp1, refs_tmp2);
   if (refs == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -788,39 +860,37 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   return err;
 }
 
-static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
-                                             const uint32_t* const argb,
-                                             VP8LHashChain* const hash_chain,
-                                             VP8LBackwardRefs refs_array[2],
-                                             int width, int height, int quality,
-                                             int low_effort,
-                                             int use_cache, int* cache_bits,
-                                             int histogram_bits,
-                                             size_t init_byte_position,
-                                             int* const hdr_size,
-                                             int* const data_size) {
+static WebPEncodingError EncodeImageInternal(
+    VP8LBitWriter* const bw, const uint32_t* const argb,
+    VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[3], int width,
+    int height, int quality, int low_effort, int use_cache,
+    const CrunchConfig* const config, int* cache_bits, int histogram_bits,
+    size_t init_byte_position, int* const hdr_size, int* const data_size) {
   WebPEncodingError err = VP8_ENC_OK;
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
   VP8LHistogramSet* histogram_image = NULL;
-  VP8LHistogramSet* tmp_histos = NULL;
+  VP8LHistogram* tmp_histo = NULL;
   int histogram_image_size = 0;
   size_t bit_array_size = 0;
-  HuffmanTree* huff_tree = NULL;
+  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+      3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
-  VP8LBackwardRefs refs;
-  VP8LBackwardRefs* best_refs;
+  VP8LBackwardRefs* refs_best;
+  VP8LBackwardRefs* refs_tmp;
   uint16_t* const histogram_symbols =
       (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
                                 sizeof(*histogram_symbols));
+  int lz77s_idx;
+  VP8LBitWriter bw_init = *bw, bw_best = { 0 };  // zeroed: wiped at Error.
+  int hdr_size_tmp;
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
   assert(hdr_size != NULL);
   assert(data_size != NULL);
 
-  VP8LBackwardRefsInit(&refs, refs_array[0].block_size_);
   if (histogram_symbols == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -836,142 +906,162 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
   // 'best_refs' is the reference to the best backward refs and points to one
   // of refs_array[0] or refs_array[1].
   // Calculate backward references from ARGB image.
-  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
-                         low_effort)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  best_refs = VP8LGetBackwardReferences(width, height, argb, quality,
-                                        low_effort, cache_bits, hash_chain,
-                                        refs_array);
-  if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  histogram_image =
-      VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
-  tmp_histos = VP8LAllocateHistogramSet(2, *cache_bits);
-  if (histogram_image == NULL || tmp_histos == NULL) {
+  if (huff_tree == NULL ||
+      !VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort) ||
+      !VP8LBitWriterInit(&bw_best, 0) ||
+      (config->lz77s_types_to_try_size_ > 1 &&
+       !VP8LBitWriterClone(bw, &bw_best))) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
+  for (lz77s_idx = 0; lz77s_idx < config->lz77s_types_to_try_size_;
+       ++lz77s_idx) {
+    refs_best = VP8LGetBackwardReferences(
+        width, height, argb, quality, low_effort,
+        config->lz77s_types_to_try_[lz77s_idx], cache_bits, hash_chain,
+        &refs_array[0], &refs_array[1]);
+    if (refs_best == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    // Keep the best references aside and use the other element from the first
+    // two as a temporary for later usage.
+    refs_tmp = &refs_array[refs_best == &refs_array[0] ? 1 : 0];
+
+    histogram_image =
+        VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
+    tmp_histo = VP8LAllocateHistogram(*cache_bits);
+    if (histogram_image == NULL || tmp_histo == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
 
-  // Build histogram image and symbols from backward references.
-  if (!VP8LGetHistoImageSymbols(width, height, &refs, quality, low_effort,
-                                histogram_bits, *cache_bits, histogram_image,
-                                tmp_histos, histogram_symbols)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  // Create Huffman bit lengths and codes for each histogram image.
-  histogram_image_size = histogram_image->size;
-  bit_array_size = 5 * histogram_image_size;
-  huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
-                                                   sizeof(*huffman_codes));
-  // Note: some histogram_image entries may point to tmp_histos[], so the latter
-  // need to outlive the following call to GetHuffBitLengthsAndCodes().
-  if (huffman_codes == NULL ||
-      !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  // Free combined histograms.
-  VP8LFreeHistogramSet(histogram_image);
-  histogram_image = NULL;
+    // Build histogram image and symbols from backward references.
+    if (!VP8LGetHistoImageSymbols(width, height, refs_best, quality, low_effort,
+                                  histogram_bits, *cache_bits, histogram_image,
+                                  tmp_histo, histogram_symbols)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    // Create Huffman bit lengths and codes for each histogram image.
+    histogram_image_size = histogram_image->size;
+    bit_array_size = 5 * histogram_image_size;
+    huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+                                                     sizeof(*huffman_codes));
+    // Note: some histogram_image entries may point to tmp_histo, so the latter
+    // needs to outlive the following call to GetHuffBitLengthsAndCodes().
+    if (huffman_codes == NULL ||
+        !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    // Free combined histograms.
+    VP8LFreeHistogramSet(histogram_image);
+    histogram_image = NULL;
 
-  // Free scratch histograms.
-  VP8LFreeHistogramSet(tmp_histos);
-  tmp_histos = NULL;
+    // Free scratch histogram.
+    VP8LFreeHistogram(tmp_histo);
+    tmp_histo = NULL;
 
-  // Color Cache parameters.
-  if (*cache_bits > 0) {
-    VP8LPutBits(bw, 1, 1);
-    VP8LPutBits(bw, *cache_bits, 4);
-  } else {
-    VP8LPutBits(bw, 0, 1);
-  }
+    // Color Cache parameters.
+    if (*cache_bits > 0) {
+      VP8LPutBits(bw, 1, 1);
+      VP8LPutBits(bw, *cache_bits, 4);
+    } else {
+      VP8LPutBits(bw, 0, 1);
+    }
 
-  // Huffman image + meta huffman.
-  {
-    const int write_histogram_image = (histogram_image_size > 1);
-    VP8LPutBits(bw, write_histogram_image, 1);
-    if (write_histogram_image) {
-      uint32_t* const histogram_argb =
-          (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
-                                    sizeof(*histogram_argb));
-      int max_index = 0;
-      uint32_t i;
-      if (histogram_argb == NULL) {
-        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-        goto Error;
-      }
-      for (i = 0; i < histogram_image_xysize; ++i) {
-        const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = (symbol_index << 8);
-        if (symbol_index >= max_index) {
-          max_index = symbol_index + 1;
+    // Huffman image + meta huffman.
+    {
+      const int write_histogram_image = (histogram_image_size > 1);
+      VP8LPutBits(bw, write_histogram_image, 1);
+      if (write_histogram_image) {
+        uint32_t* const histogram_argb =
+            (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
+                                      sizeof(*histogram_argb));
+        int max_index = 0;
+        uint32_t i;
+        if (histogram_argb == NULL) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        for (i = 0; i < histogram_image_xysize; ++i) {
+          const int symbol_index = histogram_symbols[i] & 0xffff;
+          histogram_argb[i] = (symbol_index << 8);
+          if (symbol_index >= max_index) {
+            max_index = symbol_index + 1;
+          }
         }
+        histogram_image_size = max_index;
+
+        VP8LPutBits(bw, histogram_bits - 2, 3);
+        err = EncodeImageNoHuffman(
+            bw, histogram_argb, hash_chain, refs_tmp, &refs_array[2],
+            VP8LSubSampleSize(width, histogram_bits),
+            VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+        WebPSafeFree(histogram_argb);
+        if (err != VP8_ENC_OK) goto Error;
       }
-      histogram_image_size = max_index;
-
-      VP8LPutBits(bw, histogram_bits - 2, 3);
-      err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
-                                 VP8LSubSampleSize(width, histogram_bits),
-                                 VP8LSubSampleSize(height, histogram_bits),
-                                 quality, low_effort);
-      WebPSafeFree(histogram_argb);
-      if (err != VP8_ENC_OK) goto Error;
     }
-  }
 
-  // Store Huffman codes.
-  {
-    int i;
-    int max_tokens = 0;
-    huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES,
-                                             sizeof(*huff_tree));
-    if (huff_tree == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
-    }
-    // Find maximum number of symbols for the huffman tree-set.
-    for (i = 0; i < 5 * histogram_image_size; ++i) {
-      HuffmanTreeCode* const codes = &huffman_codes[i];
-      if (max_tokens < codes->num_symbols) {
-        max_tokens = codes->num_symbols;
+    // Store Huffman codes.
+    {
+      int i;
+      int max_tokens = 0;
+      // Find maximum number of symbols for the huffman tree-set.
+      for (i = 0; i < 5 * histogram_image_size; ++i) {
+        HuffmanTreeCode* const codes = &huffman_codes[i];
+        if (max_tokens < codes->num_symbols) {
+          max_tokens = codes->num_symbols;
+        }
+      }
+      tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+      if (tokens == NULL) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
+      for (i = 0; i < 5 * histogram_image_size; ++i) {
+        HuffmanTreeCode* const codes = &huffman_codes[i];
+        StoreHuffmanCode(bw, huff_tree, tokens, codes);
+        ClearHuffmanTreeIfOnlyOneSymbol(codes);
       }
     }
-    tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens,
-                                               sizeof(*tokens));
-    if (tokens == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+    // Store actual literals.
+    hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
+    err = StoreImageToBitMask(bw, width, histogram_bits, refs_best,
+                              histogram_symbols, huffman_codes);
+    // Keep track of the smallest image so far.
+    if (lz77s_idx == 0 ||
+        VP8LBitWriterNumBytes(bw) < VP8LBitWriterNumBytes(&bw_best)) {
+      *hdr_size = hdr_size_tmp;
+      *data_size =
+          (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+      VP8LBitWriterSwap(bw, &bw_best);
     }
-    for (i = 0; i < 5 * histogram_image_size; ++i) {
-      HuffmanTreeCode* const codes = &huffman_codes[i];
-      StoreHuffmanCode(bw, huff_tree, tokens, codes);
-      ClearHuffmanTreeIfOnlyOneSymbol(codes);
+    // Reset the bit writer for the following iteration if any.
+    if (config->lz77s_types_to_try_size_ > 1) VP8LBitWriterReset(&bw_init, bw);
+    WebPSafeFree(tokens);
+    tokens = NULL;
+    if (huffman_codes != NULL) {
+      WebPSafeFree(huffman_codes->codes);
+      WebPSafeFree(huffman_codes);
+      huffman_codes = NULL;
     }
   }
-
-  *hdr_size = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
-  // Store actual literals.
-  err = StoreImageToBitMask(bw, width, histogram_bits, &refs,
-                            histogram_symbols, huffman_codes);
-  *data_size =
-        (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+  VP8LBitWriterSwap(bw, &bw_best);
 
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
-  VP8LFreeHistogramSet(tmp_histos);
-  VP8LBackwardRefsClear(&refs);
+  VP8LFreeHistogram(tmp_histo);
   if (huffman_codes != NULL) {
     WebPSafeFree(huffman_codes->codes);
     WebPSafeFree(huffman_codes);
   }
   WebPSafeFree(histogram_symbols);
+  VP8LBitWriterWipeOut(&bw_best);
   return err;
 }
 
@@ -1005,11 +1095,11 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   assert(pred_bits >= 2);
   VP8LPutBits(bw, pred_bits - 2, 3);
-  return EncodeImageNoHuffman(bw, enc->transform_data_,
-                              (VP8LHashChain*)&enc->hash_chain_,
-                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
-                              transform_width, transform_height,
-                              quality, low_effort);
+  return EncodeImageNoHuffman(
+      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
+      (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height,
+      quality, low_effort);
 }
 
 static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
@@ -1026,11 +1116,11 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
   VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   assert(ccolor_transform_bits >= 2);
   VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
-  return EncodeImageNoHuffman(bw, enc->transform_data_,
-                              (VP8LHashChain*)&enc->hash_chain_,
-                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
-                              transform_width, transform_height,
-                              quality, low_effort);
+  return EncodeImageNoHuffman(
+      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
+      (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height,
+      quality, low_effort);
 }
 
 // -----------------------------------------------------------------------------
@@ -1144,6 +1234,7 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
     }
     enc->transform_mem_ = mem;
     enc->transform_mem_size_ = (size_t)mem_size;
+    enc->argb_content_ = kEncoderNone;
   }
   enc->argb_ = mem;
   mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
@@ -1164,11 +1255,13 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
   int y;
   err = AllocateTransformBuffer(enc, width, height);
   if (err != VP8_ENC_OK) return err;
+  if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
   for (y = 0; y < height; ++y) {
     memcpy(enc->argb_ + y * width,
            picture->argb + y * picture->argb_stride,
            width * sizeof(*enc->argb_));
   }
+  enc->argb_content_ = kEncoderARGB;
   assert(enc->current_width_ == width);
   return VP8_ENC_OK;
 }
@@ -1215,12 +1308,13 @@ static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
 
 static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
   // Forget about alpha.
-  return ((color & 0x00ffffffu) * 4222244071u) >> (32 - PALETTE_INV_SIZE_BITS);
+  return ((uint32_t)((color & 0x00ffffffu) * 4222244071ull)) >>
+         (32 - PALETTE_INV_SIZE_BITS);
 }
 
 static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
   // Forget about alpha.
-  return (color & 0x00ffffffu) * ((1u << 31) - 1) >>
+  return ((uint32_t)((color & 0x00ffffffu) * ((1ull << 31) - 1))) >>
          (32 - PALETTE_INV_SIZE_BITS);
 }
 
@@ -1346,6 +1440,7 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
   err = ApplyPalette(src, src_stride,
                      enc->argb_, enc->current_width_,
                      palette, palette_size, width, height, xbits);
+  enc->argb_content_ = kEncoderPalette;
   return err;
 }
 
@@ -1364,8 +1459,9 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
     tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
   }
   tmp_palette[0] = palette[0];
-  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
-                              palette_size, 1, 20 /* quality */, low_effort);
+  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
+                              &enc->refs_[0], &enc->refs_[1], palette_size, 1,
+                              20 /* quality */, low_effort);
 }
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
@@ -1400,10 +1496,11 @@ static WebPEncodingError EncodeDeltaPalettePredictorImage(
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   VP8LPutBits(bw, pred_bits - 2, 3);
-  err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
-                             (VP8LBackwardRefs*)enc->refs_,  // cast const away
-                             transform_width, transform_height,
-                             quality, low_effort);
+  err = EncodeImageNoHuffman(
+      bw, predictors, &enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
+      (VP8LBackwardRefs*)&enc->refs_[1],
+      transform_width, transform_height, quality, low_effort);
   WebPSafeFree(predictors);
   return err;
 }
@@ -1422,6 +1519,7 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   }
   enc->config_ = config;
   enc->pic_ = picture;
+  enc->argb_content_ = kEncoderNone;
 
   VP8LEncDspInit();
 
@@ -1430,9 +1528,9 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
 
 static void VP8LEncoderDelete(VP8LEncoder* enc) {
   if (enc != NULL) {
+    int i;
     VP8LHashChainClear(&enc->hash_chain_);
-    VP8LBackwardRefsClear(&enc->refs_[0]);
-    VP8LBackwardRefsClear(&enc->refs_[1]);
+    for (i = 0; i < 3; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
     ClearTransformBuffer(enc);
     WebPSafeFree(enc);
   }
@@ -1441,134 +1539,347 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) {
 // -----------------------------------------------------------------------------
 // Main call
 
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw, int use_cache) {
+typedef struct {
+  const WebPConfig* config_;
+  const WebPPicture* picture_;
+  VP8LBitWriter* bw_;
+  VP8LEncoder* enc_;
+  int use_cache_;
+  CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
+  int num_crunch_configs_;
+  int red_and_blue_always_zero_;
+  WebPEncodingError err_;
+  WebPAuxStats* stats_;
+} StreamEncodeContext;
+
+static int EncodeStreamHook(void* input, void* data2) {
+  StreamEncodeContext* const params = (StreamEncodeContext*)input;
+  const WebPConfig* const config = params->config_;
+  const WebPPicture* const picture = params->picture_;
+  VP8LBitWriter* const bw = params->bw_;
+  VP8LEncoder* const enc = params->enc_;
+  const int use_cache = params->use_cache_;
+  const CrunchConfig* const crunch_configs = params->crunch_configs_;
+  const int num_crunch_configs = params->num_crunch_configs_;
+  const int red_and_blue_always_zero = params->red_and_blue_always_zero_;
+#if !defined(WEBP_DISABLE_STATS)
+  WebPAuxStats* const stats = params->stats_;
+#endif
   WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
   const int low_effort = (config->method == 0);
+#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
   const int width = picture->width;
+#endif
   const int height = picture->height;
-  VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
   const size_t byte_position = VP8LBitWriterNumBytes(bw);
+#if (WEBP_NEAR_LOSSLESS == 1)
   int use_near_lossless = 0;
+#endif
   int hdr_size = 0;
   int data_size = 0;
   int use_delta_palette = 0;
+  int idx;
+  size_t best_size = 0;
+  VP8LBitWriter bw_init = *bw, bw_best;
+  (void)data2;
 
-  if (enc == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-
-  // ---------------------------------------------------------------------------
-  // Analyze image (entropy, num_palettes etc)
-
-  if (!AnalyzeAndInit(enc)) {
+  if (!VP8LBitWriterInit(&bw_best, 0) ||
+      (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
-  // Apply near-lossless preprocessing.
-  use_near_lossless =
-      (config->near_lossless < 100) && !enc->use_palette_ && !enc->use_predict_;
-  if (use_near_lossless) {
-    if (!VP8ApplyNearLossless(width, height, picture->argb,
-                              config->near_lossless)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+  for (idx = 0; idx < num_crunch_configs; ++idx) {
+    const int entropy_idx = crunch_configs[idx].entropy_idx_;
+    enc->use_palette_ = (entropy_idx == kPalette);
+    enc->use_subtract_green_ =
+        (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen);
+    enc->use_predict_ =
+        (entropy_idx == kSpatial) || (entropy_idx == kSpatialSubGreen);
+    if (low_effort) {
+      enc->use_cross_color_ = 0;
+    } else {
+      enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
     }
-  }
+    // Reset any parameter in the encoder that is set in the previous iteration.
+    enc->cache_bits_ = 0;
+    VP8LBackwardRefsClear(&enc->refs_[0]);
+    VP8LBackwardRefsClear(&enc->refs_[1]);
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (config->use_delta_palette) {
-    enc->use_predict_ = 1;
-    enc->use_cross_color_ = 0;
-    enc->use_subtract_green_ = 0;
-    enc->use_palette_ = 1;
-    err = MakeInputImageCopy(enc);
-    if (err != VP8_ENC_OK) goto Error;
-    err = WebPSearchOptimalDeltaPalette(enc);
-    if (err != VP8_ENC_OK) goto Error;
-    if (enc->use_palette_) {
+#if (WEBP_NEAR_LOSSLESS == 1)
+    // Apply near-lossless preprocessing.
+    use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
+                        !enc->use_predict_;
+    if (use_near_lossless) {
       err = AllocateTransformBuffer(enc, width, height);
       if (err != VP8_ENC_OK) goto Error;
-      err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
+      if ((enc->argb_content_ != kEncoderNearLossless) &&
+          !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
+      enc->argb_content_ = kEncoderNearLossless;
+    } else {
+      enc->argb_content_ = kEncoderNone;
+    }
+#else
+    enc->argb_content_ = kEncoderNone;
+#endif
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (config->use_delta_palette) {
+      enc->use_predict_ = 1;
+      enc->use_cross_color_ = 0;
+      enc->use_subtract_green_ = 0;
+      enc->use_palette_ = 1;
+      if (enc->argb_content_ != kEncoderNearLossless &&
+          enc->argb_content_ != kEncoderPalette) {
+        err = MakeInputImageCopy(enc);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+      err = WebPSearchOptimalDeltaPalette(enc);
       if (err != VP8_ENC_OK) goto Error;
-      use_delta_palette = 1;
+      if (enc->use_palette_) {
+        err = AllocateTransformBuffer(enc, width, height);
+        if (err != VP8_ENC_OK) goto Error;
+        err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
+        if (err != VP8_ENC_OK) goto Error;
+        use_delta_palette = 1;
+      }
     }
-  }
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
-  // Encode palette
-  if (enc->use_palette_) {
-    err = EncodePalette(bw, low_effort, enc);
-    if (err != VP8_ENC_OK) goto Error;
-    err = MapImageFromPalette(enc, use_delta_palette);
-    if (err != VP8_ENC_OK) goto Error;
-    // If using a color cache, do not have it bigger than the number of colors.
-    if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
-      enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
-    }
-  }
-  if (!use_delta_palette) {
-    // In case image is not packed.
-    if (enc->argb_ == NULL) {
-      err = MakeInputImageCopy(enc);
+    // Encode palette
+    if (enc->use_palette_) {
+      err = EncodePalette(bw, low_effort, enc);
+      if (err != VP8_ENC_OK) goto Error;
+      err = MapImageFromPalette(enc, use_delta_palette);
       if (err != VP8_ENC_OK) goto Error;
+      // If using a color cache, do not have it bigger than the number of
+      // colors.
+      if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+        enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+      }
     }
+    if (!use_delta_palette) {
+      // In case image is not packed.
+      if (enc->argb_content_ != kEncoderNearLossless &&
+          enc->argb_content_ != kEncoderPalette) {
+        err = MakeInputImageCopy(enc);
+        if (err != VP8_ENC_OK) goto Error;
+      }
 
-    // -------------------------------------------------------------------------
-    // Apply transforms and write transform data.
+      // -----------------------------------------------------------------------
+      // Apply transforms and write transform data.
 
-    if (enc->use_subtract_green_) {
-      ApplySubtractGreen(enc, enc->current_width_, height, bw);
-    }
+      if (enc->use_subtract_green_) {
+        ApplySubtractGreen(enc, enc->current_width_, height, bw);
+      }
 
-    if (enc->use_predict_) {
-      err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
-                               low_effort, enc->use_subtract_green_, bw);
-      if (err != VP8_ENC_OK) goto Error;
+      if (enc->use_predict_) {
+        err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                                 low_effort, enc->use_subtract_green_, bw);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+
+      if (enc->use_cross_color_) {
+        err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                                    low_effort, bw);
+        if (err != VP8_ENC_OK) goto Error;
+      }
     }
 
-    if (enc->use_cross_color_) {
-      err = ApplyCrossColorFilter(enc, enc->current_width_,
-                                  height, quality, low_effort, bw);
-      if (err != VP8_ENC_OK) goto Error;
+    VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+
+    // -------------------------------------------------------------------------
+    // Encode and write the transformed image.
+    err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+                              enc->current_width_, height, quality, low_effort,
+                              use_cache, &crunch_configs[idx],
+                              &enc->cache_bits_, enc->histo_bits_,
+                              byte_position, &hdr_size, &data_size);
+    if (err != VP8_ENC_OK) goto Error;
+
+    // If we are better than what we already have.
+    if (idx == 0 || VP8LBitWriterNumBytes(bw) < best_size) {
+      best_size = VP8LBitWriterNumBytes(bw);
+      // Store the BitWriter.
+      VP8LBitWriterSwap(bw, &bw_best);
+#if !defined(WEBP_DISABLE_STATS)
+      // Update the stats.
+      if (stats != NULL) {
+        stats->lossless_features = 0;
+        if (enc->use_predict_) stats->lossless_features |= 1;
+        if (enc->use_cross_color_) stats->lossless_features |= 2;
+        if (enc->use_subtract_green_) stats->lossless_features |= 4;
+        if (enc->use_palette_) stats->lossless_features |= 8;
+        stats->histogram_bits = enc->histo_bits_;
+        stats->transform_bits = enc->transform_bits_;
+        stats->cache_bits = enc->cache_bits_;
+        stats->palette_size = enc->palette_size_;
+        stats->lossless_size = (int)(best_size - byte_position);
+        stats->lossless_hdr_size = hdr_size;
+        stats->lossless_data_size = data_size;
+      }
+#endif
     }
+    // Reset the bit writer for the following iteration if any.
+    if (num_crunch_configs > 1) VP8LBitWriterReset(&bw_init, bw);
   }
+  VP8LBitWriterSwap(&bw_best, bw);
 
-  VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+Error:
+  VP8LBitWriterWipeOut(&bw_best);
+  params->err_ = err;
+  // The hook should return false in case of error.
+  return (err == VP8_ENC_OK);
+}
 
-  // ---------------------------------------------------------------------------
-  // Encode and write the transformed image.
-  err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
-                            enc->current_width_, height, quality, low_effort,
-                            use_cache, &enc->cache_bits_, enc->histo_bits_,
-                            byte_position, &hdr_size, &data_size);
-  if (err != VP8_ENC_OK) goto Error;
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+                                   const WebPPicture* const picture,
+                                   VP8LBitWriter* const bw_main,
+                                   int use_cache) {
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
+  VP8LEncoder* enc_side = NULL;
+  CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
+  int num_crunch_configs_main, num_crunch_configs_side = 0;
+  int idx;
+  int red_and_blue_always_zero = 0;
+  WebPWorker worker_main, worker_side;
+  StreamEncodeContext params_main, params_side;
+  // The main thread uses picture->stats, the side thread uses stats_side.
+  WebPAuxStats stats_side;
+  VP8LBitWriter bw_side;
+  const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
+  int ok_main;
 
-  if (picture->stats != NULL) {
-    WebPAuxStats* const stats = picture->stats;
-    stats->lossless_features = 0;
-    if (enc->use_predict_) stats->lossless_features |= 1;
-    if (enc->use_cross_color_) stats->lossless_features |= 2;
-    if (enc->use_subtract_green_) stats->lossless_features |= 4;
-    if (enc->use_palette_) stats->lossless_features |= 8;
-    stats->histogram_bits = enc->histo_bits_;
-    stats->transform_bits = enc->transform_bits_;
-    stats->cache_bits = enc->cache_bits_;
-    stats->palette_size = enc->palette_size_;
-    stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
-    stats->lossless_hdr_size = hdr_size;
-    stats->lossless_data_size = data_size;
+  // Analyze image (entropy, num_palettes etc)
+  if (enc_main == NULL ||
+      !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+                      &red_and_blue_always_zero) ||
+      !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
   }
 
- Error:
-  VP8LEncoderDelete(enc);
+  // Split the configs between the main and side threads (if any).
+  if (config->thread_level > 0) {
+    num_crunch_configs_side = num_crunch_configs_main / 2;
+    for (idx = 0; idx < num_crunch_configs_side; ++idx) {
+      params_side.crunch_configs_[idx] =
+          crunch_configs[num_crunch_configs_main - num_crunch_configs_side +
+                         idx];
+    }
+    params_side.num_crunch_configs_ = num_crunch_configs_side;
+  }
+  num_crunch_configs_main -= num_crunch_configs_side;
+  for (idx = 0; idx < num_crunch_configs_main; ++idx) {
+    params_main.crunch_configs_[idx] = crunch_configs[idx];
+  }
+  params_main.num_crunch_configs_ = num_crunch_configs_main;
+
+  // Fill in the parameters for the thread workers.
+  {
+    const int params_size = (num_crunch_configs_side > 0) ? 2 : 1;
+    for (idx = 0; idx < params_size; ++idx) {
+      // Create the parameters for each worker.
+      WebPWorker* const worker = (idx == 0) ? &worker_main : &worker_side;
+      StreamEncodeContext* const param =
+          (idx == 0) ? &params_main : &params_side;
+      param->config_ = config;
+      param->picture_ = picture;
+      param->use_cache_ = use_cache;
+      param->red_and_blue_always_zero_ = red_and_blue_always_zero;
+      if (idx == 0) {
+        param->stats_ = picture->stats;
+        param->bw_ = bw_main;
+        param->enc_ = enc_main;
+      } else {
+        param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
+        // Create a side bit writer.
+        if (!VP8LBitWriterClone(bw_main, &bw_side)) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        param->bw_ = &bw_side;
+        // Create a side encoder.
+        enc_side = VP8LEncoderNew(config, picture);
+        if (enc_side == NULL || !EncoderInit(enc_side)) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        // Copy the values that were computed for the main encoder.
+        enc_side->histo_bits_ = enc_main->histo_bits_;
+        enc_side->transform_bits_ = enc_main->transform_bits_;
+        enc_side->palette_size_ = enc_main->palette_size_;
+        memcpy(enc_side->palette_, enc_main->palette_,
+               sizeof(enc_main->palette_));
+        param->enc_ = enc_side;
+      }
+      // Create the workers.
+      worker_interface->Init(worker);
+      worker->data1 = param;
+      worker->data2 = NULL;
+      worker->hook = (WebPWorkerHook)EncodeStreamHook;
+    }
+  }
+
+  // Start the second thread if needed.
+  if (num_crunch_configs_side != 0) {
+    if (!worker_interface->Reset(&worker_side)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+#if !defined(WEBP_DISABLE_STATS)
+    // This line is here and not in the param initialization above to remove a
+    // Clang static analyzer warning.
+    if (picture->stats != NULL) {
+      memcpy(&stats_side, picture->stats, sizeof(stats_side));
+    }
+#endif
+    // This line is only useful to remove a Clang static analyzer warning.
+    params_side.err_ = VP8_ENC_OK;
+    worker_interface->Launch(&worker_side);
+  }
+  // Execute the main thread.
+  worker_interface->Execute(&worker_main);
+  ok_main = worker_interface->Sync(&worker_main);
+  worker_interface->End(&worker_main);
+  if (num_crunch_configs_side != 0) {
+    // Wait for the second thread.
+    const int ok_side = worker_interface->Sync(&worker_side);
+    worker_interface->End(&worker_side);
+    if (!ok_main || !ok_side) {
+      err = ok_main ? params_side.err_ : params_main.err_;
+      goto Error;
+    }
+    if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
+      VP8LBitWriterSwap(bw_main, &bw_side);
+#if !defined(WEBP_DISABLE_STATS)
+      if (picture->stats != NULL) {
+        memcpy(picture->stats, &stats_side, sizeof(*picture->stats));
+      }
+#endif
+    }
+  } else {
+    if (!ok_main) {
+      err = params_main.err_;
+      goto Error;
+    }
+  }
+
+Error:
+  VP8LBitWriterWipeOut(&bw_side);
+  VP8LEncoderDelete(enc_main);
+  VP8LEncoderDelete(enc_side);
   return err;
 }
 
+#undef CRUNCH_CONFIGS_MAX
+#undef CRUNCH_CONFIGS_LZ77_MAX
+
 int VP8LEncodeImage(const WebPConfig* const config,
                     const WebPPicture* const picture) {
   int width, height;
@@ -1642,11 +1953,13 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
 
+#if !defined(WEBP_DISABLE_STATS)
   // Save size.
   if (picture->stats != NULL) {
     picture->stats->coded_size += (int)coded_size;
     picture->stats->lossless_size = (int)coded_size;
   }
+#endif
 
   if (picture->extra_info != NULL) {
     const int mb_w = (width + 15) >> 4;
diff --git a/thirdparty/libwebp/enc/vp8li_enc.h b/thirdparty/libwebp/src/enc/vp8li_enc.h
index 8c5fbcbb2e..298a4a0014 100644
--- a/thirdparty/libwebp/enc/vp8li_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8li_enc.h
@@ -11,14 +11,23 @@
 //
 // Author: Vikas Arora (vikaas.arora@gmail.com)
 
-#ifndef WEBP_ENC_VP8LI_H_
-#define WEBP_ENC_VP8LI_H_
+#ifndef WEBP_ENC_VP8LI_ENC_H_
+#define WEBP_ENC_VP8LI_ENC_H_
 
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../utils/bit_writer_utils.h"
-#include "../webp/encode.h"
-#include "../webp/format_constants.h"
+#ifdef HAVE_CONFIG_H
+#include "src/webp/config.h"
+#endif
+// Either WEBP_NEAR_LOSSLESS is defined as 0 in config.h when compiling to
+// disable near-lossless, or it is enabled by default.
+#ifndef WEBP_NEAR_LOSSLESS
+#define WEBP_NEAR_LOSSLESS 1
+#endif
+
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,16 +36,24 @@ extern "C" {
 // maximum value of transform_bits_ in VP8LEncoder.
 #define MAX_TRANSFORM_BITS 6
 
+typedef enum {
+  kEncoderNone = 0,
+  kEncoderARGB,
+  kEncoderNearLossless,
+  kEncoderPalette
+} VP8LEncoderARGBContent;
+
 typedef struct {
   const WebPConfig* config_;      // user configuration and parameters
   const WebPPicture* pic_;        // input picture.
 
-  uint32_t* argb_;                // Transformed argb image data.
-  uint32_t* argb_scratch_;        // Scratch memory for argb rows
-                                  // (used for prediction).
-  uint32_t* transform_data_;      // Scratch memory for transform data.
-  uint32_t* transform_mem_;       // Currently allocated memory.
-  size_t    transform_mem_size_;  // Currently allocated memory size.
+  uint32_t* argb_;                       // Transformed argb image data.
+  VP8LEncoderARGBContent argb_content_;  // Content type of the argb buffer.
+  uint32_t* argb_scratch_;               // Scratch memory for argb rows
+                                         // (used for prediction).
+  uint32_t* transform_data_;             // Scratch memory for transform data.
+  uint32_t* transform_mem_;              // Currently allocated memory.
+  size_t    transform_mem_size_;         // Currently allocated memory size.
 
   int       current_width_;       // Corresponds to packed image width.
 
@@ -54,8 +71,7 @@ typedef struct {
   uint32_t palette_[MAX_PALETTE_SIZE];
 
   // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[2];  // Backward Refs array corresponding to
-                                     // LZ77 & RLE coding.
+  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
   VP8LHashChain hash_chain_;         // HashChain data for constructing
                                      // backward references.
 } VP8LEncoder;
@@ -75,6 +91,13 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
                                    const WebPPicture* const picture,
                                    VP8LBitWriter* const bw, int use_cache);
 
+#if (WEBP_NEAR_LOSSLESS == 1)
+// in near_lossless.c
+// Near lossless preprocessing in RGB color-space.
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst);
+#endif
+
 //------------------------------------------------------------------------------
 // Image transforms in predictor.c.
 
@@ -92,4 +115,4 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_ENC_VP8LI_H_ */
+#endif  /* WEBP_ENC_VP8LI_ENC_H_ */
diff --git a/thirdparty/libwebp/enc/webp_enc.c b/thirdparty/libwebp/src/enc/webp_enc.c
index f18461ef92..283cda8e7b 100644
--- a/thirdparty/libwebp/enc/webp_enc.c
+++ b/thirdparty/libwebp/src/enc/webp_enc.c
@@ -16,10 +16,10 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost_enc.h"
-#include "./vp8i_enc.h"
-#include "./vp8li_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/vp8li_enc.h"
+#include "src/utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
 
@@ -207,7 +207,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->preds_w_ = preds_w;
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
-  enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
+  enc->preds_ = mem + 1 + enc->preds_w_;
   mem += preds_size;
   enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
   mem += nz_size;
@@ -216,7 +216,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
 
   // top samples (all 16-aligned)
   mem = (uint8_t*)WEBP_ALIGN(mem);
-  enc->y_top_ = (uint8_t*)mem;
+  enc->y_top_ = mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
   assert(mem <= (uint8_t*)enc + size);
@@ -256,6 +256,7 @@ static int DeleteVP8Encoder(VP8Encoder* enc) {
 
 //------------------------------------------------------------------------------
 
+#if !defined(WEBP_DISABLE_STATS)
 static double GetPSNR(uint64_t err, uint64_t size) {
   return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
 }
@@ -270,8 +271,10 @@ static void FinalizePSNR(const VP8Encoder* const enc) {
   stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
   stats->PSNR[4] = (float)GetPSNR(sse[3], size);
 }
+#endif  // !defined(WEBP_DISABLE_STATS)
 
 static void StoreStats(VP8Encoder* const enc) {
+#if !defined(WEBP_DISABLE_STATS)
   WebPAuxStats* const stats = enc->pic_->stats;
   if (stats != NULL) {
     int i, s;
@@ -288,7 +291,9 @@ static void StoreStats(VP8Encoder* const enc) {
       stats->block_count[i] = enc->block_count_[i];
     }
   }
+#else  // defined(WEBP_DISABLE_STATS)
   WebPReportProgress(enc->pic_, 100, &enc->percent_);  // done!
+#endif  // !defined(WEBP_DISABLE_STATS)
 }
 
 int WebPEncodingSetError(const WebPPicture* const pic,
@@ -336,10 +341,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
 
-    if (!config->exact) {
-      WebPCleanupTransparentArea(pic);
-    }
-
     if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
       if (config->use_sharp_yuv || (config->preprocessing & 4)) {
@@ -361,6 +362,10 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
       }
     }
 
+    if (!config->exact) {
+      WebPCleanupTransparentArea(pic);
+    }
+
     enc = InitVP8Encoder(config, pic);
     if (enc == NULL) return 0;  // pic->error is already set.
     // Note: each of the tasks below account for 20% in the progress report.
diff --git a/thirdparty/libwebp/mux/anim_encode.c b/thirdparty/libwebp/src/mux/anim_encode.c
index 6066388727..7be99068f6 100644
--- a/thirdparty/libwebp/mux/anim_encode.c
+++ b/thirdparty/libwebp/src/mux/anim_encode.c
@@ -16,12 +16,12 @@
 #include <stdio.h>
 #include <stdlib.h>  // for abs()
 
-#include "../mux/animi.h"
-#include "../utils/utils.h"
-#include "../webp/decode.h"
-#include "../webp/encode.h"
-#include "../webp/format_constants.h"
-#include "../webp/mux.h"
+#include "src/mux/animi.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/mux.h"
 
 #if defined(_MSC_VER) && _MSC_VER < 1900
 #define snprintf _snprintf
@@ -35,7 +35,7 @@
 // Stores frame rectangle dimensions.
 typedef struct {
   int x_offset_, y_offset_, width_, height_;
-} FrameRect;
+} FrameRectangle;
 
 // Used to store two candidates of encoded data for an animation frame. One of
 // the two will be chosen later.
@@ -50,7 +50,7 @@ struct WebPAnimEncoder {
   const int canvas_height_;                 // Canvas height.
   const WebPAnimEncoderOptions options_;    // Global encoding options.
 
-  FrameRect prev_rect_;               // Previous WebP frame rectangle.
+  FrameRectangle prev_rect_;          // Previous WebP frame rectangle.
   WebPConfig last_config_;            // Cached in case a re-encode is needed.
   WebPConfig last_config_reversed_;   // If 'last_config_' uses lossless, then
                                       // this config uses lossy and vice versa;
@@ -206,7 +206,7 @@ static void ClearRectangle(WebPPicture* const picture,
 }
 
 static void WebPUtilClearPic(WebPPicture* const picture,
-                             const FrameRect* const rect) {
+                             const FrameRectangle* const rect) {
   if (rect != NULL) {
     ClearRectangle(picture, rect->x_offset_, rect->y_offset_,
                    rect->width_, rect->height_);
@@ -400,7 +400,7 @@ static WEBP_INLINE int ComparePixelsLossy(const uint32_t* src, int src_step,
   return 1;
 }
 
-static int IsEmptyRect(const FrameRect* const rect) {
+static int IsEmptyRect(const FrameRectangle* const rect) {
   return (rect->width_ == 0) || (rect->height_ == 0);
 }
 
@@ -413,7 +413,7 @@ static int QualityToMaxDiff(float quality) {
 // Assumes that an initial valid guess of change rectangle 'rect' is passed.
 static void MinimizeChangeRectangle(const WebPPicture* const src,
                                     const WebPPicture* const dst,
-                                    FrameRect* const rect,
+                                    FrameRectangle* const rect,
                                     int is_lossless, float quality) {
   int i, j;
   const ComparePixelsFunc compare_pixels =
@@ -498,7 +498,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src,
 }
 
 // Snap rectangle to even offsets (and adjust dimensions if needed).
-static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
+static WEBP_INLINE void SnapToEvenOffsets(FrameRectangle* const rect) {
   rect->width_ += (rect->x_offset_ & 1);
   rect->height_ += (rect->y_offset_ & 1);
   rect->x_offset_ &= ~1;
@@ -508,9 +508,9 @@ static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
 typedef struct {
   int should_try_;               // Should try this set of parameters.
   int empty_rect_allowed_;       // Frame with empty rectangle can be skipped.
-  FrameRect rect_ll_;            // Frame rectangle for lossless compression.
+  FrameRectangle rect_ll_;       // Frame rectangle for lossless compression.
   WebPPicture sub_frame_ll_;     // Sub-frame pic for lossless compression.
-  FrameRect rect_lossy_;         // Frame rectangle for lossy compression.
+  FrameRectangle rect_lossy_;    // Frame rectangle for lossy compression.
                                  // Could be smaller than rect_ll_ as pixels
                                  // with small diffs can be ignored.
   WebPPicture sub_frame_lossy_;  // Sub-frame pic for lossless compression.
@@ -538,7 +538,8 @@ static void SubFrameParamsFree(SubFrameParams* const params) {
 static int GetSubRect(const WebPPicture* const prev_canvas,
                       const WebPPicture* const curr_canvas, int is_key_frame,
                       int is_first_frame, int empty_rect_allowed,
-                      int is_lossless, float quality, FrameRect* const rect,
+                      int is_lossless, float quality,
+                      FrameRectangle* const rect,
                       WebPPicture* const sub_frame) {
   if (!is_key_frame || is_first_frame) {  // Optimize frame rectangle.
     // Note: This behaves as expected for first frame, as 'prev_canvas' is
@@ -594,7 +595,7 @@ int WebPAnimEncoderRefineRect(
     const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas,
     int is_lossless, float quality, int* const x_offset, int* const y_offset,
     int* const width, int* const height) {
-  FrameRect rect;
+  FrameRectangle rect;
   const int right = clip(*x_offset + *width, 0, curr_canvas->width);
   const int left = clip(*x_offset, 0, curr_canvas->width - 1);
   const int bottom = clip(*y_offset + *height, 0, curr_canvas->height);
@@ -620,7 +621,7 @@ int WebPAnimEncoderRefineRect(
 }
 
 static void DisposeFrameRectangle(int dispose_method,
-                                  const FrameRect* const rect,
+                                  const FrameRectangle* const rect,
                                   WebPPicture* const curr_canvas) {
   assert(rect != NULL);
   if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
@@ -628,13 +629,13 @@ static void DisposeFrameRectangle(int dispose_method,
   }
 }
 
-static uint32_t RectArea(const FrameRect* const rect) {
+static uint32_t RectArea(const FrameRectangle* const rect) {
   return (uint32_t)rect->width_ * rect->height_;
 }
 
 static int IsLosslessBlendingPossible(const WebPPicture* const src,
                                       const WebPPicture* const dst,
-                                      const FrameRect* const rect) {
+                                      const FrameRectangle* const rect) {
   int i, j;
   assert(src->width == dst->width && src->height == dst->height);
   assert(rect->x_offset_ + rect->width_ <= dst->width);
@@ -656,7 +657,7 @@ static int IsLosslessBlendingPossible(const WebPPicture* const src,
 
 static int IsLossyBlendingPossible(const WebPPicture* const src,
                                    const WebPPicture* const dst,
-                                   const FrameRect* const rect,
+                                   const FrameRectangle* const rect,
                                    float quality) {
   const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
   int i, j;
@@ -683,7 +684,7 @@ static int IsLossyBlendingPossible(const WebPPicture* const src,
 // transparent pixels.
 // Returns true if at least one pixel gets modified.
 static int IncreaseTransparency(const WebPPicture* const src,
-                                const FrameRect* const rect,
+                                const FrameRectangle* const rect,
                                 WebPPicture* const dst) {
   int i, j;
   int modified = 0;
@@ -709,7 +710,7 @@ static int IncreaseTransparency(const WebPPicture* const src,
 // Assumes lossy compression is being used.
 // Returns true if at least one pixel gets modified.
 static int FlattenSimilarBlocks(const WebPPicture* const src,
-                                const FrameRect* const rect,
+                                const FrameRectangle* const rect,
                                 WebPPicture* const dst, float quality) {
   const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
   int i, j;
@@ -778,13 +779,13 @@ static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
 typedef struct {
   WebPMemoryWriter  mem_;
   WebPMuxFrameInfo  info_;
-  FrameRect         rect_;
+  FrameRectangle    rect_;
   int               evaluate_;  // True if this candidate should be evaluated.
 } Candidate;
 
 // Generates a candidate encoded frame given a picture and metadata.
 static WebPEncodingError EncodeCandidate(WebPPicture* const sub_frame,
-                                         const FrameRect* const rect,
+                                         const FrameRectangle* const rect,
                                          const WebPConfig* const encoder_config,
                                          int use_blending,
                                          Candidate* const candidate) {
@@ -958,7 +959,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
   if (new_duration >= MAX_DURATION) {  // Special case.
     // Separate out previous frame from earlier merged frames to avoid overflow.
     // We add a 1x1 transparent frame for the previous frame, with blending on.
-    const FrameRect rect = { 0, 0, 1, 1 };
+    const FrameRectangle rect = { 0, 0, 1, 1 };
     const uint8_t lossless_1x1_bytes[] = {
       0x52, 0x49, 0x46, 0x46, 0x14, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50,
       0x56, 0x50, 0x38, 0x4c, 0x08, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00,
@@ -1223,7 +1224,7 @@ static int CacheFrame(WebPAnimEncoder* const enc,
       enc->prev_candidate_undecided_ = 0;
     } else {
       int64_t curr_delta;
-      FrameRect prev_rect_key, prev_rect_sub;
+      FrameRectangle prev_rect_key, prev_rect_sub;
 
       // Add this as a frame rectangle to enc.
       error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped);
@@ -1535,7 +1536,8 @@ int WebPAnimEncoderAssemble(WebPAnimEncoder* enc, WebPData* webp_data) {
 
   if (!enc->got_null_frame_ && enc->in_frame_count_ > 1 && enc->count_ > 0) {
     // set duration of the last frame to be avg of durations of previous frames.
-    const double delta_time = enc->prev_timestamp_ - enc->first_timestamp_;
+    const double delta_time =
+        (uint32_t)enc->prev_timestamp_ - enc->first_timestamp_;
     const int average_duration = (int)(delta_time / (enc->in_frame_count_ - 1));
     if (!IncreasePreviousDuration(enc, average_duration)) {
       return 0;
diff --git a/thirdparty/libwebp/mux/animi.h b/thirdparty/libwebp/src/mux/animi.h
index cecaf1fee5..88899532aa 100644
--- a/thirdparty/libwebp/mux/animi.h
+++ b/thirdparty/libwebp/src/mux/animi.h
@@ -14,7 +14,7 @@
 #ifndef WEBP_MUX_ANIMI_H_
 #define WEBP_MUX_ANIMI_H_
 
-#include "../webp/mux.h"
+#include "src/webp/mux.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/thirdparty/libwebp/mux/muxedit.c b/thirdparty/libwebp/src/mux/muxedit.c
index d2c5305372..7a027b3cb4 100644
--- a/thirdparty/libwebp/mux/muxedit.c
+++ b/thirdparty/libwebp/src/mux/muxedit.c
@@ -13,8 +13,8 @@
 //          Vikas (vikasa@google.com)
 
 #include <assert.h>
-#include "./muxi.h"
-#include "../utils/utils.h"
+#include "src/mux/muxi.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Life of a mux object.
diff --git a/thirdparty/libwebp/mux/muxi.h b/thirdparty/libwebp/src/mux/muxi.h
index e6606aa5d1..b73e3fbd7a 100644
--- a/thirdparty/libwebp/mux/muxi.h
+++ b/thirdparty/libwebp/src/mux/muxi.h
@@ -15,9 +15,9 @@
 #define WEBP_MUX_MUXI_H_
 
 #include <stdlib.h>
-#include "../dec/vp8i_dec.h"
-#include "../dec/vp8li_dec.h"
-#include "../webp/mux.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/webp/mux.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,7 +28,7 @@ extern "C" {
 
 #define MUX_MAJ_VERSION 0
 #define MUX_MIN_VERSION 4
-#define MUX_REV_VERSION 0
+#define MUX_REV_VERSION 1
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
diff --git a/thirdparty/libwebp/mux/muxinternal.c b/thirdparty/libwebp/src/mux/muxinternal.c
index 387b57e8fe..1473f100e5 100644
--- a/thirdparty/libwebp/mux/muxinternal.c
+++ b/thirdparty/libwebp/src/mux/muxinternal.c
@@ -13,8 +13,8 @@
 //          Vikas (vikasa@google.com)
 
 #include <assert.h>
-#include "./muxi.h"
-#include "../utils/utils.h"
+#include "src/mux/muxi.h"
+#include "src/utils/utils.h"
 
 #define UNDEFINED_CHUNK_SIZE ((uint32_t)(-1))
 
@@ -504,6 +504,20 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
     if (!has_animation && (num_anim == 1 || num_frames > 0)) {
       return WEBP_MUX_INVALID_ARGUMENT;
     }
+    if (!has_animation) {
+      const WebPMuxImage* images = mux->images_;
+      // There can be only one image.
+      if (images == NULL || images->next_ != NULL) {
+        return WEBP_MUX_INVALID_ARGUMENT;
+      }
+      // Size must match.
+      if (mux->canvas_width_ > 0) {
+        if (images->width_ != mux->canvas_width_ ||
+            images->height_ != mux->canvas_height_) {
+          return WEBP_MUX_INVALID_ARGUMENT;
+        }
+      }
+    }
   }
 
   // Verify either VP8X chunk is present OR there is only one elem in
@@ -515,6 +529,7 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
   if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT;
 
   // ALPHA_FLAG & alpha chunk(s) are consistent.
+  // Note: ALPHA_FLAG can be set when there is actually no Alpha data present.
   if (MuxHasAlpha(mux->images_)) {
     if (num_vp8x > 0) {
       // VP8X chunk is present, so it should contain ALPHA_FLAG.
@@ -525,8 +540,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
       if (err != WEBP_MUX_OK) return err;
       if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT;
     }
-  } else {  // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present.
-    if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
   }
 
   return WEBP_MUX_OK;
diff --git a/thirdparty/libwebp/mux/muxread.c b/thirdparty/libwebp/src/mux/muxread.c
index 410acd9119..0b55286862 100644
--- a/thirdparty/libwebp/mux/muxread.c
+++ b/thirdparty/libwebp/src/mux/muxread.c
@@ -13,8 +13,8 @@
 //          Vikas (vikasa@google.com)
 
 #include <assert.h>
-#include "./muxi.h"
-#include "../utils/utils.h"
+#include "src/mux/muxi.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Helper method(s).
@@ -43,7 +43,7 @@ static WebPMuxError MuxGet(const WebPMux* const mux, CHUNK_INDEX idx,
   SWITCH_ID_LIST(IDX_ANIM, mux->anim_);
   SWITCH_ID_LIST(IDX_EXIF, mux->exif_);
   SWITCH_ID_LIST(IDX_XMP, mux->xmp_);
-  SWITCH_ID_LIST(IDX_UNKNOWN, mux->unknown_);
+  assert(idx != IDX_UNKNOWN);
   return WEBP_MUX_NOT_FOUND;
 }
 #undef SWITCH_ID_LIST
@@ -270,6 +270,9 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
     ChunkInit(&chunk);
   }
 
+  // Incomplete image.
+  if (wpi->is_partial_) goto Err;
+
   // Validate mux if complete.
   if (MuxValidate(mux) != WEBP_MUX_OK) goto Err;
 
diff --git a/thirdparty/libwebp/utils/bit_reader_inl_utils.h b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h
index fd7fb0446c..2ccc6ed326 100644
--- a/thirdparty/libwebp/utils/bit_reader_inl_utils.h
+++ b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -13,19 +13,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_BIT_READER_INL_H_
-#define WEBP_UTILS_BIT_READER_INL_H_
+#ifndef WEBP_UTILS_BIT_READER_INL_UTILS_H_
+#define WEBP_UTILS_BIT_READER_INL_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <string.h>  // for memcpy
 
-#include "../dsp/dsp.h"
-#include "./bit_reader_utils.h"
-#include "./endian_inl_utils.h"
-#include "./utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -187,4 +187,4 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
 }    // extern "C"
 #endif
 
-#endif   // WEBP_UTILS_BIT_READER_INL_H_
+#endif   // WEBP_UTILS_BIT_READER_INL_UTILS_H_
diff --git a/thirdparty/libwebp/utils/bit_reader_utils.c b/thirdparty/libwebp/src/utils/bit_reader_utils.c
index 053b710bb8..5fa3ae7795 100644
--- a/thirdparty/libwebp/utils/bit_reader_utils.c
+++ b/thirdparty/libwebp/src/utils/bit_reader_utils.c
@@ -12,11 +12,11 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "./bit_reader_inl_utils.h"
-#include "../utils/utils.h"
+#include "src/utils/bit_reader_inl_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8BitReader
diff --git a/thirdparty/libwebp/utils/bit_reader_utils.h b/thirdparty/libwebp/src/utils/bit_reader_utils.h
index ea5c584eb4..04f9804409 100644
--- a/thirdparty/libwebp/utils/bit_reader_utils.h
+++ b/thirdparty/libwebp/src/utils/bit_reader_utils.h
@@ -12,14 +12,14 @@
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora (vikaas.arora@gmail.com)
 
-#ifndef WEBP_UTILS_BIT_READER_H_
-#define WEBP_UTILS_BIT_READER_H_
+#ifndef WEBP_UTILS_BIT_READER_UTILS_H_
+#define WEBP_UTILS_BIT_READER_UTILS_H_
 
 #include <assert.h>
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -165,9 +165,10 @@ static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
 
 // For jumping over a number of bits in the bit stream when accessed with
 // VP8LPrefetchBits and VP8LFillBitWindow.
+// This function does *not* set br->eos_, since it's speed-critical.
+// Use with extreme care!
 static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
   br->bit_pos_ = val;
-  br->eos_ = VP8LIsEndOfStream(br);
 }
 
 // Advances the read buffer by 4 bytes to make room for reading next 32 bits.
@@ -181,4 +182,4 @@ static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_BIT_READER_H_ */
+#endif  /* WEBP_UTILS_BIT_READER_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/bit_writer_utils.c b/thirdparty/libwebp/src/utils/bit_writer_utils.c
index ab0c49dce8..f4f476ce3f 100644
--- a/thirdparty/libwebp/utils/bit_writer_utils.c
+++ b/thirdparty/libwebp/src/utils/bit_writer_utils.c
@@ -16,9 +16,9 @@
 #include <string.h>   // for memcpy()
 #include <stdlib.h>
 
-#include "./bit_writer_utils.h"
-#include "./endian_inl_utils.h"
-#include "./utils.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8BitWriter
@@ -239,6 +239,18 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
   return VP8LBitWriterResize(bw, expected_size);
 }
 
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst) {
+  const size_t current_size = src->cur_ - src->buf_;
+  assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
+  if (!VP8LBitWriterResize(dst, current_size)) return 0;
+  memcpy(dst->buf_, src->buf_, current_size);
+  dst->bits_ = src->bits_;
+  dst->used_ = src->used_;
+  dst->error_ = src->error_;
+  return 1;
+}
+
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
   if (bw != NULL) {
     WebPSafeFree(bw->buf_);
@@ -246,6 +258,21 @@ void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
   }
 }
 
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw) {
+  bw->bits_ = bw_init->bits_;
+  bw->used_ = bw_init->used_;
+  bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
+  assert(bw->cur_ <= bw->end_);
+  bw->error_ = bw_init->error_;
+}
+
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
+  const VP8LBitWriter tmp = *src;
+  *src = *dst;
+  *dst = tmp;
+}
+
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
   // If needed, make some room by flushing some bits out.
   if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
diff --git a/thirdparty/libwebp/utils/bit_writer_utils.h b/thirdparty/libwebp/src/utils/bit_writer_utils.h
index 9c02bbc06d..2cf5976fe3 100644
--- a/thirdparty/libwebp/utils/bit_writer_utils.h
+++ b/thirdparty/libwebp/src/utils/bit_writer_utils.h
@@ -11,10 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_BIT_WRITER_H_
-#define WEBP_UTILS_BIT_WRITER_H_
+#ifndef WEBP_UTILS_BIT_WRITER_UTILS_H_
+#define WEBP_UTILS_BIT_WRITER_UTILS_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -100,16 +100,24 @@ typedef struct {
   int error_;
 } VP8LBitWriter;
 
-static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
+static WEBP_INLINE size_t VP8LBitWriterNumBytes(const VP8LBitWriter* const bw) {
   return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
 }
 
 // Returns false in case of memory allocation error.
 int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
+// Returns false in case of memory allocation error.
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst);
 // Finalize the bitstream coding. Returns a pointer to the internal buffer.
 uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);
 // Release any pending memory and zeroes the object.
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw);
+// Resets the cursor of the BitWriter bw to when it was like in bw_init.
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw);
+// Swaps the memory held by two BitWriters.
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst);
 
 // Internal function for VP8LPutBits flushing 32 bits from the written state.
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw);
@@ -143,4 +151,4 @@ static WEBP_INLINE void VP8LPutBits(VP8LBitWriter* const bw,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_BIT_WRITER_H_ */
+#endif  /* WEBP_UTILS_BIT_WRITER_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/color_cache_utils.c b/thirdparty/libwebp/src/utils/color_cache_utils.c
index 0172590c48..b09f538e8b 100644
--- a/thirdparty/libwebp/utils/color_cache_utils.c
+++ b/thirdparty/libwebp/src/utils/color_cache_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./color_cache_utils.h"
-#include "./utils.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8LColorCache.
diff --git a/thirdparty/libwebp/utils/color_cache_utils.h b/thirdparty/libwebp/src/utils/color_cache_utils.h
index c373e6b361..20b7be11c9 100644
--- a/thirdparty/libwebp/utils/color_cache_utils.h
+++ b/thirdparty/libwebp/src/utils/color_cache_utils.h
@@ -12,10 +12,12 @@
 // Authors: Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#ifndef WEBP_UTILS_COLOR_CACHE_H_
-#define WEBP_UTILS_COLOR_CACHE_H_
+#ifndef WEBP_UTILS_COLOR_CACHE_UTILS_H_
+#define WEBP_UTILS_COLOR_CACHE_UTILS_H_
 
-#include "../webp/types.h"
+#include <assert.h>
+
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,7 +32,7 @@ typedef struct {
 
 static const uint64_t kHashMul = 0x1e35a7bdull;
 
-static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
+static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
   return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
 }
 
@@ -48,19 +50,19 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,
 
 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  const int key = HashPix(argb, cc->hash_shift_);
+  const int key = VP8LHashPix(argb, cc->hash_shift_);
   cc->colors_[key] = argb;
 }
 
 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  return HashPix(argb, cc->hash_shift_);
+  return VP8LHashPix(argb, cc->hash_shift_);
 }
 
 // Return the key if cc contains argb, and -1 otherwise.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  const int key = HashPix(argb, cc->hash_shift_);
+  const int key = VP8LHashPix(argb, cc->hash_shift_);
   return (cc->colors_[key] == argb) ? key : -1;
 }
 
@@ -82,4 +84,4 @@ void VP8LColorCacheClear(VP8LColorCache* const color_cache);
 }
 #endif
 
-#endif  // WEBP_UTILS_COLOR_CACHE_H_
+#endif  // WEBP_UTILS_COLOR_CACHE_UTILS_H_
diff --git a/thirdparty/libwebp/utils/endian_inl_utils.h b/thirdparty/libwebp/src/utils/endian_inl_utils.h
index e11260ff7d..4b2f91dfb8 100644
--- a/thirdparty/libwebp/utils/endian_inl_utils.h
+++ b/thirdparty/libwebp/src/utils/endian_inl_utils.h
@@ -9,15 +9,15 @@
 //
 // Endian related functions.
 
-#ifndef WEBP_UTILS_ENDIAN_INL_H_
-#define WEBP_UTILS_ENDIAN_INL_H_
+#ifndef WEBP_UTILS_ENDIAN_INL_UTILS_H_
+#define WEBP_UTILS_ENDIAN_INL_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../dsp/dsp.h"
-#include "../webp/types.h"
+#include "src/dsp/dsp.h"
+#include "src/webp/types.h"
 
 // some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
 #if !defined(WORDS_BIGENDIAN) && \
@@ -97,4 +97,4 @@ static WEBP_INLINE uint64_t BSwap64(uint64_t x) {
 #endif  // HAVE_BUILTIN_BSWAP64
 }
 
-#endif  // WEBP_UTILS_ENDIAN_INL_H_
+#endif  // WEBP_UTILS_ENDIAN_INL_UTILS_H_
diff --git a/thirdparty/libwebp/utils/filters_utils.c b/thirdparty/libwebp/src/utils/filters_utils.c
index 49c1d18a22..bbc2c34d93 100644
--- a/thirdparty/libwebp/utils/filters_utils.c
+++ b/thirdparty/libwebp/src/utils/filters_utils.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#include "./filters_utils.h"
+#include "src/utils/filters_utils.h"
 #include <stdlib.h>
 #include <string.h>
 
diff --git a/thirdparty/libwebp/utils/filters_utils.h b/thirdparty/libwebp/src/utils/filters_utils.h
index 088b132fc5..410f2fcdf2 100644
--- a/thirdparty/libwebp/utils/filters_utils.h
+++ b/thirdparty/libwebp/src/utils/filters_utils.h
@@ -11,11 +11,11 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#ifndef WEBP_UTILS_FILTERS_H_
-#define WEBP_UTILS_FILTERS_H_
+#ifndef WEBP_UTILS_FILTERS_UTILS_H_
+#define WEBP_UTILS_FILTERS_UTILS_H_
 
-#include "../webp/types.h"
-#include "../dsp/dsp.h"
+#include "src/webp/types.h"
+#include "src/dsp/dsp.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -29,4 +29,4 @@ WEBP_FILTER_TYPE WebPEstimateBestFilter(const uint8_t* data,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_FILTERS_H_ */
+#endif  /* WEBP_UTILS_FILTERS_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/huffman_encode_utils.c b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
index f9504658ea..6f3b1bbe02 100644
--- a/thirdparty/libwebp/utils/huffman_encode_utils.c
+++ b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -14,9 +14,9 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman_encode_utils.h"
-#include "./utils.h"
-#include "../webp/format_constants.h"
+#include "src/utils/huffman_encode_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
 // Util function to optimize the symbol map for RLE coding
diff --git a/thirdparty/libwebp/utils/huffman_encode_utils.h b/thirdparty/libwebp/src/utils/huffman_encode_utils.h
index a157165148..3e6763ce49 100644
--- a/thirdparty/libwebp/utils/huffman_encode_utils.h
+++ b/thirdparty/libwebp/src/utils/huffman_encode_utils.h
@@ -11,10 +11,10 @@
 //
 // Entropy encoding (Huffman) for webp lossless
 
-#ifndef WEBP_UTILS_HUFFMAN_ENCODE_H_
-#define WEBP_UTILS_HUFFMAN_ENCODE_H_
+#ifndef WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_
+#define WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -57,4 +57,4 @@ void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
 }
 #endif
 
-#endif  // WEBP_UTILS_HUFFMAN_ENCODE_H_
+#endif  // WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_
diff --git a/thirdparty/libwebp/utils/huffman_utils.c b/thirdparty/libwebp/src/utils/huffman_utils.c
index 008b5d746f..7a69963c3e 100644
--- a/thirdparty/libwebp/utils/huffman_utils.c
+++ b/thirdparty/libwebp/src/utils/huffman_utils.c
@@ -14,9 +14,9 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman_utils.h"
-#include "./utils.h"
-#include "../webp/format_constants.h"
+#include "src/utils/huffman_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 // Huffman data read via DecodeImageStream is represented in two (red and green)
 // bytes.
diff --git a/thirdparty/libwebp/utils/huffman_utils.h b/thirdparty/libwebp/src/utils/huffman_utils.h
index c6dd6aaa45..ff7ef17f3b 100644
--- a/thirdparty/libwebp/utils/huffman_utils.h
+++ b/thirdparty/libwebp/src/utils/huffman_utils.h
@@ -11,12 +11,12 @@
 //
 // Author: Urvang Joshi (urvang@google.com)
 
-#ifndef WEBP_UTILS_HUFFMAN_H_
-#define WEBP_UTILS_HUFFMAN_H_
+#ifndef WEBP_UTILS_HUFFMAN_UTILS_H_
+#define WEBP_UTILS_HUFFMAN_UTILS_H_
 
 #include <assert.h>
-#include "../webp/format_constants.h"
-#include "../webp/types.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -85,4 +85,4 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 }    // extern "C"
 #endif
 
-#endif  // WEBP_UTILS_HUFFMAN_H_
+#endif  // WEBP_UTILS_HUFFMAN_UTILS_H_
diff --git a/thirdparty/libwebp/utils/quant_levels_dec_utils.c b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
index d4d23d3147..3818a78b93 100644
--- a/thirdparty/libwebp/utils/quant_levels_dec_utils.c
+++ b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -14,11 +14,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./quant_levels_dec_utils.h"
+#include "src/utils/quant_levels_dec_utils.h"
 
 #include <string.h>   // for memset
 
-#include "./utils.h"
+#include "src/utils/utils.h"
 
 // #define USE_DITHERING   // uncomment to enable ordered dithering (not vital)
 
@@ -71,10 +71,11 @@ typedef struct {
 
 //------------------------------------------------------------------------------
 
-#define CLIP_MASK (int)(~0U << (8 + DFIX))
+#define CLIP_8b_MASK (int)(~0U << (8 + DFIX))
 static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
+  return (!(v & CLIP_8b_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
 }
+#undef CLIP_8b_MASK
 
 // vertical accumulation
 static void VFilter(SmoothParams* const p) {
diff --git a/thirdparty/libwebp/utils/quant_levels_dec_utils.h b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.h
index 59a13495d3..f822107a72 100644
--- a/thirdparty/libwebp/utils/quant_levels_dec_utils.h
+++ b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.h
@@ -11,10 +11,10 @@
 //
 // Author:  Vikas Arora (vikasa@google.com)
 
-#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_
-#define WEBP_UTILS_QUANT_LEVELS_DEC_H_
+#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_
+#define WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,4 +32,4 @@ int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */
+#endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/quant_levels_utils.c b/thirdparty/libwebp/src/utils/quant_levels_utils.c
index 73174e8ab9..d65ad3c29d 100644
--- a/thirdparty/libwebp/utils/quant_levels_utils.c
+++ b/thirdparty/libwebp/src/utils/quant_levels_utils.c
@@ -14,7 +14,7 @@
 
 #include <assert.h>
 
-#include "./quant_levels_utils.h"
+#include "src/utils/quant_levels_utils.h"
 
 #define NUM_SYMBOLS     256
 
diff --git a/thirdparty/libwebp/utils/quant_levels_utils.h b/thirdparty/libwebp/src/utils/quant_levels_utils.h
index 1cb5a32cae..75df2ba6a4 100644
--- a/thirdparty/libwebp/utils/quant_levels_utils.h
+++ b/thirdparty/libwebp/src/utils/quant_levels_utils.h
@@ -11,12 +11,12 @@
 //
 // Author:  Vikas Arora (vikasa@google.com)
 
-#ifndef WEBP_UTILS_QUANT_LEVELS_H_
-#define WEBP_UTILS_QUANT_LEVELS_H_
+#ifndef WEBP_UTILS_QUANT_LEVELS_UTILS_H_
+#define WEBP_UTILS_QUANT_LEVELS_UTILS_H_
 
 #include <stdlib.h>
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,4 +33,4 @@ int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_QUANT_LEVELS_H_ */
+#endif  /* WEBP_UTILS_QUANT_LEVELS_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/random_utils.c b/thirdparty/libwebp/src/utils/random_utils.c
index 9f1e4154a6..7edb3fefbb 100644
--- a/thirdparty/libwebp/utils/random_utils.c
+++ b/thirdparty/libwebp/src/utils/random_utils.c
@@ -12,7 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <string.h>
-#include "./random_utils.h"
+#include "src/utils/random_utils.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/utils/random_utils.h b/thirdparty/libwebp/src/utils/random_utils.h
index c392a615ca..6d36c667e7 100644
--- a/thirdparty/libwebp/utils/random_utils.h
+++ b/thirdparty/libwebp/src/utils/random_utils.h
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_RANDOM_H_
-#define WEBP_UTILS_RANDOM_H_
+#ifndef WEBP_UTILS_RANDOM_UTILS_H_
+#define WEBP_UTILS_RANDOM_UTILS_H_
 
 #include <assert.h>
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,4 +60,4 @@ static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_RANDOM_H_ */
+#endif  /* WEBP_UTILS_RANDOM_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/rescaler_utils.c b/thirdparty/libwebp/src/utils/rescaler_utils.c
index 0d1f80da24..90e2ea76a1 100644
--- a/thirdparty/libwebp/utils/rescaler_utils.c
+++ b/thirdparty/libwebp/src/utils/rescaler_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "../dsp/dsp.h"
-#include "./rescaler_utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -85,11 +85,13 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
 
     // if width is unspecified, scale original proportionally to height ratio.
     if (width == 0) {
-      width = (src_width * height + src_height / 2) / src_height;
+      width =
+          (int)(((uint64_t)src_width * height + src_height / 2) / src_height);
     }
     // if height is unspecified, scale original proportionally to width ratio.
     if (height == 0) {
-      height = (src_height * width + src_width / 2) / src_width;
+      height =
+          (int)(((uint64_t)src_height * width + src_width / 2) / src_width);
     }
     // Check if the overall dimensions still make sense.
     if (width <= 0 || height <= 0) {
diff --git a/thirdparty/libwebp/utils/rescaler_utils.h b/thirdparty/libwebp/src/utils/rescaler_utils.h
index 98b01a76d0..8890e6fa13 100644
--- a/thirdparty/libwebp/utils/rescaler_utils.h
+++ b/thirdparty/libwebp/src/utils/rescaler_utils.h
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_RESCALER_H_
-#define WEBP_UTILS_RESCALER_H_
+#ifndef WEBP_UTILS_RESCALER_UTILS_H_
+#define WEBP_UTILS_RESCALER_UTILS_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
 #define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
@@ -98,4 +98,4 @@ int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_RESCALER_H_ */
+#endif  /* WEBP_UTILS_RESCALER_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/thread_utils.c b/thirdparty/libwebp/src/utils/thread_utils.c
index 1729060c70..2052b6b006 100644
--- a/thirdparty/libwebp/utils/thread_utils.c
+++ b/thirdparty/libwebp/src/utils/thread_utils.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 #include <string.h>   // for memset()
-#include "./thread_utils.h"
-#include "./utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/utils/utils.h"
 
 #ifdef WEBP_USE_THREAD
 
@@ -50,11 +50,11 @@ typedef struct {
 
 #endif  // _WIN32
 
-struct WebPWorkerImpl {
+typedef struct {
   pthread_mutex_t mutex_;
   pthread_cond_t  condition_;
   pthread_t       thread_;
-};
+} WebPWorkerImpl;
 
 #if defined(_WIN32)
 
@@ -201,25 +201,24 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
 
 //------------------------------------------------------------------------------
 
-static void Execute(WebPWorker* const worker);  // Forward declaration.
-
 static THREADFN ThreadLoop(void* ptr) {
   WebPWorker* const worker = (WebPWorker*)ptr;
+  WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_;
   int done = 0;
   while (!done) {
-    pthread_mutex_lock(&worker->impl_->mutex_);
+    pthread_mutex_lock(&impl->mutex_);
     while (worker->status_ == OK) {   // wait in idling mode
-      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+      pthread_cond_wait(&impl->condition_, &impl->mutex_);
     }
     if (worker->status_ == WORK) {
-      Execute(worker);
+      WebPGetWorkerInterface()->Execute(worker);
       worker->status_ = OK;
     } else if (worker->status_ == NOT_OK) {   // finish the worker
       done = 1;
     }
     // signal to the main thread that we're done (for Sync())
-    pthread_cond_signal(&worker->impl_->condition_);
-    pthread_mutex_unlock(&worker->impl_->mutex_);
+    pthread_cond_signal(&impl->condition_);
+    pthread_mutex_unlock(&impl->mutex_);
   }
   return THREAD_RETURN(NULL);    // Thread is finished
 }
@@ -229,21 +228,22 @@ static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
   // No-op when attempting to change state on a thread that didn't come up.
   // Checking status_ without acquiring the lock first would result in a data
   // race.
-  if (worker->impl_ == NULL) return;
+  WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_;
+  if (impl == NULL) return;
 
-  pthread_mutex_lock(&worker->impl_->mutex_);
+  pthread_mutex_lock(&impl->mutex_);
   if (worker->status_ >= OK) {
     // wait for the worker to finish
     while (worker->status_ != OK) {
-      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+      pthread_cond_wait(&impl->condition_, &impl->mutex_);
     }
     // assign new status and release the working thread if needed
     if (new_status != OK) {
       worker->status_ = new_status;
-      pthread_cond_signal(&worker->impl_->condition_);
+      pthread_cond_signal(&impl->condition_);
     }
   }
-  pthread_mutex_unlock(&worker->impl_->mutex_);
+  pthread_mutex_unlock(&impl->mutex_);
 }
 
 #endif  // WEBP_USE_THREAD
@@ -268,26 +268,28 @@ static int Reset(WebPWorker* const worker) {
   worker->had_error = 0;
   if (worker->status_ < OK) {
 #ifdef WEBP_USE_THREAD
-    worker->impl_ = (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(*worker->impl_));
+    WebPWorkerImpl* const impl =
+        (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(WebPWorkerImpl));
+    worker->impl_ = (void*)impl;
     if (worker->impl_ == NULL) {
       return 0;
     }
-    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+    if (pthread_mutex_init(&impl->mutex_, NULL)) {
       goto Error;
     }
-    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
-      pthread_mutex_destroy(&worker->impl_->mutex_);
+    if (pthread_cond_init(&impl->condition_, NULL)) {
+      pthread_mutex_destroy(&impl->mutex_);
       goto Error;
     }
-    pthread_mutex_lock(&worker->impl_->mutex_);
-    ok = !pthread_create(&worker->impl_->thread_, NULL, ThreadLoop, worker);
+    pthread_mutex_lock(&impl->mutex_);
+    ok = !pthread_create(&impl->thread_, NULL, ThreadLoop, worker);
     if (ok) worker->status_ = OK;
-    pthread_mutex_unlock(&worker->impl_->mutex_);
+    pthread_mutex_unlock(&impl->mutex_);
     if (!ok) {
-      pthread_mutex_destroy(&worker->impl_->mutex_);
-      pthread_cond_destroy(&worker->impl_->condition_);
+      pthread_mutex_destroy(&impl->mutex_);
+      pthread_cond_destroy(&impl->condition_);
  Error:
-      WebPSafeFree(worker->impl_);
+      WebPSafeFree(impl);
       worker->impl_ = NULL;
       return 0;
     }
@@ -318,11 +320,12 @@ static void Launch(WebPWorker* const worker) {
 static void End(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
   if (worker->impl_ != NULL) {
+    WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_;
     ChangeState(worker, NOT_OK);
-    pthread_join(worker->impl_->thread_, NULL);
-    pthread_mutex_destroy(&worker->impl_->mutex_);
-    pthread_cond_destroy(&worker->impl_->condition_);
-    WebPSafeFree(worker->impl_);
+    pthread_join(impl->thread_, NULL);
+    pthread_mutex_destroy(&impl->mutex_);
+    pthread_cond_destroy(&impl->condition_);
+    WebPSafeFree(impl);
     worker->impl_ = NULL;
   }
 #else
diff --git a/thirdparty/libwebp/utils/thread_utils.h b/thirdparty/libwebp/src/utils/thread_utils.h
index 8408311855..c8ae6c9033 100644
--- a/thirdparty/libwebp/utils/thread_utils.h
+++ b/thirdparty/libwebp/src/utils/thread_utils.h
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_THREAD_H_
-#define WEBP_UTILS_THREAD_H_
+#ifndef WEBP_UTILS_THREAD_UTILS_H_
+#define WEBP_UTILS_THREAD_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,12 +35,9 @@ typedef enum {
 // arguments (data1 and data2), and should return false in case of error.
 typedef int (*WebPWorkerHook)(void*, void*);
 
-// Platform-dependent implementation details for the worker.
-typedef struct WebPWorkerImpl WebPWorkerImpl;
-
 // Synchronization object used to launch job in the worker thread
 typedef struct {
-  WebPWorkerImpl* impl_;
+  void* impl_;            // platform-dependent implementation worker details
   WebPWorkerStatus status_;
   WebPWorkerHook hook;    // hook to call
   void* data1;            // first argument passed to 'hook'
@@ -78,11 +75,11 @@ typedef struct {
 // decoding takes place. The contents of the interface struct are copied, it
 // is safe to free the corresponding memory after this call. This function is
 // not thread-safe. Return false in case of invalid pointer or methods.
-WEBP_EXTERN(int) WebPSetWorkerInterface(
+WEBP_EXTERN int WebPSetWorkerInterface(
     const WebPWorkerInterface* const winterface);
 
 // Retrieve the currently set thread worker interface.
-WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void);
+WEBP_EXTERN const WebPWorkerInterface* WebPGetWorkerInterface(void);
 
 //------------------------------------------------------------------------------
 
@@ -90,4 +87,4 @@ WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void);
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_THREAD_H_ */
+#endif  /* WEBP_UTILS_THREAD_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/utils.c b/thirdparty/libwebp/src/utils/utils.c
index 504d924b60..44d5c14f01 100644
--- a/thirdparty/libwebp/utils/utils.c
+++ b/thirdparty/libwebp/src/utils/utils.c
@@ -13,10 +13,11 @@
 
 #include <stdlib.h>
 #include <string.h>  // for memcpy()
-#include "../webp/decode.h"
-#include "../webp/encode.h"
-#include "../webp/format_constants.h"  // for MAX_PALETTE_SIZE
-#include "./utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"  // for MAX_PALETTE_SIZE
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
 
 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
@@ -252,7 +253,6 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
   int num_colors = 0;
   uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
   uint32_t colors[COLOR_HASH_SIZE];
-  static const uint64_t kHashMul = 0x1e35a7bdull;
   const uint32_t* argb = pic->argb;
   const int width = pic->width;
   const int height = pic->height;
@@ -267,7 +267,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
         continue;
       }
       last_pix = argb[x];
-      key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
+      key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT);
       while (1) {
         if (!in_use[key]) {
           colors[key] = last_pix;
diff --git a/thirdparty/libwebp/utils/utils.h b/thirdparty/libwebp/src/utils/utils.h
index 3ab459050a..52921bf24e 100644
--- a/thirdparty/libwebp/utils/utils.h
+++ b/thirdparty/libwebp/src/utils/utils.h
@@ -16,14 +16,14 @@
 #define WEBP_UTILS_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <assert.h>
 #include <limits.h>
 
-#include "../dsp/dsp.h"
-#include "../webp/types.h"
+#include "src/dsp/dsp.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -48,13 +48,13 @@ extern "C" {
 // somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this
 // safe malloc() borrows the signature from calloc(), pointing at the dangerous
 // underlying multiply involved.
-WEBP_EXTERN(void*) WebPSafeMalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN void* WebPSafeMalloc(uint64_t nmemb, size_t size);
 // Note that WebPSafeCalloc() expects the second argument type to be 'size_t'
 // in order to favor the "calloc(num_foo, sizeof(foo))" pattern.
-WEBP_EXTERN(void*) WebPSafeCalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN void* WebPSafeCalloc(uint64_t nmemb, size_t size);
 
 // Companion deallocation function to the above allocations.
-WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
+WEBP_EXTERN void WebPSafeFree(void* const ptr);
 
 //------------------------------------------------------------------------------
 // Alignment
@@ -66,7 +66,7 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
 static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
   uint32_t A;
-  memcpy(&A, (const int*)ptr, sizeof(A));
+  memcpy(&A, ptr, sizeof(A));
   return A;
 }
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
@@ -112,12 +112,12 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
 #define WEBP_NEED_LOG_TABLE_8BIT
 extern const uint8_t WebPLogTable8bit[256];
 static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
-  int log = 0;
+  int log_value = 0;
   while (n >= 256) {
-    log += 8;
+    log_value += 8;
     n >>= 8;
   }
-  return log + WebPLogTable8bit[n];
+  return log_value + WebPLogTable8bit[n];
 }
 
 // Returns (int)floor(log2(n)). n must be > 0.
@@ -147,14 +147,14 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
 struct WebPPicture;
 
 // Copy width x height pixels from 'src' to 'dst' honoring the strides.
-WEBP_EXTERN(void) WebPCopyPlane(const uint8_t* src, int src_stride,
-                                uint8_t* dst, int dst_stride,
-                                int width, int height);
+WEBP_EXTERN void WebPCopyPlane(const uint8_t* src, int src_stride,
+                               uint8_t* dst, int dst_stride,
+                               int width, int height);
 
 // Copy ARGB pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are
 // assumed to be already allocated and using ARGB data.
-WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
-                                 struct WebPPicture* const dst);
+WEBP_EXTERN void WebPCopyPixels(const struct WebPPicture* const src,
+                                struct WebPPicture* const dst);
 
 //------------------------------------------------------------------------------
 // Unique colors.
@@ -166,8 +166,8 @@ WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
 // MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'.
 // Note: 'palette' is assumed to be an array already allocated with at least
 // MAX_PALETTE_SIZE elements.
-WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic,
-                                     uint32_t* const palette);
+WEBP_EXTERN int WebPGetColorPalette(const struct WebPPicture* const pic,
+                                    uint32_t* const palette);
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/webp/decode.h b/thirdparty/libwebp/src/webp/decode.h
index 4c5e74ac36..2165e96c95 100644
--- a/thirdparty/libwebp/webp/decode.h
+++ b/thirdparty/libwebp/src/webp/decode.h
@@ -36,39 +36,39 @@ typedef struct WebPDecoderConfig WebPDecoderConfig;
 
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetDecoderVersion(void);
+WEBP_EXTERN int WebPGetDecoderVersion(void);
 
 // Retrieve basic header information: width, height.
 // This function will also validate the header, returning true on success,
 // false otherwise. '*width' and '*height' are only valid on successful return.
 // Pointers 'width' and 'height' can be passed NULL if deemed irrelevant.
-WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, size_t data_size,
-                             int* width, int* height);
+WEBP_EXTERN int WebPGetInfo(const uint8_t* data, size_t data_size,
+                            int* width, int* height);
 
 // Decodes WebP images pointed to by 'data' and returns RGBA samples, along
 // with the dimensions in *width and *height. The ordering of samples in
 // memory is R, G, B, A, R, G, B, A... in scan order (endian-independent).
 // The returned pointer should be deleted calling WebPFree().
 // Returns NULL in case of error.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, size_t data_size,
-                                     int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data.
-WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, size_t data_size,
-                                     int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data.
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, size_t data_size,
-                                     int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data.
 // If the bitstream contains transparency, it is ignored.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                                   int* width, int* height);
 
 // Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data.
-WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
+                                   int* width, int* height);
 
 
 // Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer
@@ -80,13 +80,13 @@ WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
 // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
-WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, size_t data_size,
-                                    int* width, int* height,
-                                    uint8_t** u, uint8_t** v,
-                                    int* stride, int* uv_stride);
+WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
+                                   int* width, int* height,
+                                   uint8_t** u, uint8_t** v,
+                                   int* stride, int* uv_stride);
 
 // Releases memory returned by the WebPDecode*() functions above.
-WEBP_EXTERN(void) WebPFree(void* ptr);
+WEBP_EXTERN void WebPFree(void* ptr);
 
 // These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
@@ -96,22 +96,22 @@ WEBP_EXTERN(void) WebPFree(void* ptr);
 // The parameter 'output_stride' specifies the distance (in bytes)
 // between scanlines. Hence, output_buffer_size is expected to be at least
 // output_stride x picture-height.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
+WEBP_EXTERN uint8_t* WebPDecodeRGBAInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
+WEBP_EXTERN uint8_t* WebPDecodeARGBInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
+WEBP_EXTERN uint8_t* WebPDecodeBGRAInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
 // RGB and BGR variants. Here too the transparency information, if present,
 // will be dropped and ignored.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
+WEBP_EXTERN uint8_t* WebPDecodeRGBInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
+WEBP_EXTERN uint8_t* WebPDecodeBGRInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
@@ -122,7 +122,7 @@ WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
 // 'u_size' and 'v_size' respectively.
 // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
 // during decoding (or because some buffers were found to be too small).
-WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
+WEBP_EXTERN uint8_t* WebPDecodeYUVInto(
     const uint8_t* data, size_t data_size,
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
@@ -213,7 +213,7 @@ struct WebPDecBuffer {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int);
+WEBP_EXTERN int WebPInitDecBufferInternal(WebPDecBuffer*, int);
 
 // Initialize the structure as empty. Must be called before any other use.
 // Returns false in case of version mismatch
@@ -223,7 +223,7 @@ static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
 
 // Free any memory associated with the buffer. Must always be called last.
 // Note: doesn't free the 'buffer' structure itself.
-WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer);
+WEBP_EXTERN void WebPFreeDecBuffer(WebPDecBuffer* buffer);
 
 //------------------------------------------------------------------------------
 // Enumeration of the status codes
@@ -277,7 +277,7 @@ typedef enum VP8StatusCode {
 // within valid bounds.
 // All other fields of WebPDecBuffer MUST remain constant between calls.
 // Returns NULL if the allocation failed.
-WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
+WEBP_EXTERN WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer);
 
 // This function allocates and initializes an incremental-decoder object, which
 // will output the RGB/A samples specified by 'csp' into a preallocated
@@ -289,7 +289,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
 // colorspace 'csp' is taken into account for allocating this buffer. All other
 // parameters are ignored.
 // Returns NULL if the allocation failed, or if some parameters are invalid.
-WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
+WEBP_EXTERN WebPIDecoder* WebPINewRGB(
     WEBP_CSP_MODE csp,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
@@ -304,7 +304,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
 // In this case, the output buffer will be automatically allocated (using
 // MODE_YUVA) when decoding starts. All parameters are then ignored.
 // Returns NULL if the allocation failed or if a parameter is invalid.
-WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
+WEBP_EXTERN WebPIDecoder* WebPINewYUVA(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
     uint8_t* v, size_t v_size, int v_stride,
@@ -312,19 +312,19 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
 
 // Deprecated version of the above, without the alpha plane.
 // Kept for backward compatibility.
-WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
+WEBP_EXTERN WebPIDecoder* WebPINewYUV(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
     uint8_t* v, size_t v_size, int v_stride);
 
 // Deletes the WebPIDecoder object and associated memory. Must always be called
 // if WebPINewDecoder, WebPINewRGB or WebPINewYUV succeeded.
-WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* idec);
+WEBP_EXTERN void WebPIDelete(WebPIDecoder* idec);
 
 // Copies and decodes the next available data. Returns VP8_STATUS_OK when
 // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
 // data is expected. Returns error in other cases.
-WEBP_EXTERN(VP8StatusCode) WebPIAppend(
+WEBP_EXTERN VP8StatusCode WebPIAppend(
     WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 
 // A variant of the above function to be used when data buffer contains
@@ -332,7 +332,7 @@ WEBP_EXTERN(VP8StatusCode) WebPIAppend(
 // to the internal memory.
 // Note that the value of the 'data' pointer can change between calls to
 // WebPIUpdate, for instance when the data buffer is resized to fit larger data.
-WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
+WEBP_EXTERN VP8StatusCode WebPIUpdate(
     WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 
 // Returns the RGB/A image decoded so far. Returns NULL if output params
@@ -340,15 +340,16 @@ WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
 // specified during call to WebPINewDecoder() or WebPINewRGB().
 // *last_y is the index of last decoded row in raster scan order. Some pointers
 // (*last_y, *width etc.) can be NULL if corresponding information is not
-// needed.
-WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
+// needed. The values in these pointers are only valid on successful (non-NULL)
+// return.
+WEBP_EXTERN uint8_t* WebPIDecGetRGB(
     const WebPIDecoder* idec, int* last_y,
     int* width, int* height, int* stride);
 
 // Same as above function to get a YUVA image. Returns pointer to the luma
 // plane or NULL in case of error. If there is no alpha information
 // the alpha pointer '*a' will be returned NULL.
-WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA(
+WEBP_EXTERN uint8_t* WebPIDecGetYUVA(
     const WebPIDecoder* idec, int* last_y,
     uint8_t** u, uint8_t** v, uint8_t** a,
     int* width, int* height, int* stride, int* uv_stride, int* a_stride);
@@ -368,7 +369,7 @@ static WEBP_INLINE uint8_t* WebPIDecGetYUV(
 // Returns NULL in case the incremental decoder object is in an invalid state.
 // Otherwise returns the pointer to the internal representation. This structure
 // is read-only, tied to WebPIDecoder's lifespan and should not be modified.
-WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
+WEBP_EXTERN const WebPDecBuffer* WebPIDecodedArea(
     const WebPIDecoder* idec, int* left, int* top, int* width, int* height);
 
 //------------------------------------------------------------------------------
@@ -416,7 +417,7 @@ struct WebPBitstreamFeatures {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
+WEBP_EXTERN VP8StatusCode WebPGetFeaturesInternal(
     const uint8_t*, size_t, WebPBitstreamFeatures*, int);
 
 // Retrieve features from the bitstream. The *features structure is filled
@@ -457,7 +458,7 @@ struct WebPDecoderConfig {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
+WEBP_EXTERN int WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
 
 // Initialize the configuration as empty. This function must always be
 // called first, unless WebPGetFeatures() is to be called.
@@ -477,14 +478,14 @@ static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
 // The return WebPIDecoder object must always be deleted calling WebPIDelete().
 // Returns NULL in case of error (and config->status will then reflect
 // the error condition, if available).
-WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
-                                       WebPDecoderConfig* config);
+WEBP_EXTERN WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
+                                      WebPDecoderConfig* config);
 
 // Non-incremental version. This version decodes the full data at once, taking
 // 'config' into account. Returns decoding status (which should be VP8_STATUS_OK
 // if the decoding was successful). Note that 'config' cannot be NULL.
-WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
-                                      WebPDecoderConfig* config);
+WEBP_EXTERN VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
+                                     WebPDecoderConfig* config);
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/thirdparty/libwebp/webp/demux.h b/thirdparty/libwebp/src/webp/demux.h
index 454f6914b2..555d641338 100644
--- a/thirdparty/libwebp/webp/demux.h
+++ b/thirdparty/libwebp/src/webp/demux.h
@@ -71,7 +71,7 @@ typedef struct WebPAnimDecoderOptions WebPAnimDecoderOptions;
 
 // Returns the version number of the demux library, packed in hexadecimal using
 // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetDemuxVersion(void);
+WEBP_EXTERN int WebPGetDemuxVersion(void);
 
 //------------------------------------------------------------------------------
 // Life of a Demux object
@@ -85,7 +85,7 @@ typedef enum WebPDemuxState {
 } WebPDemuxState;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(WebPDemuxer*) WebPDemuxInternal(
+WEBP_EXTERN WebPDemuxer* WebPDemuxInternal(
     const WebPData*, int, WebPDemuxState*, int);
 
 // Parses the full WebP file given by 'data'. For single images the WebP file
@@ -109,27 +109,32 @@ static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
 }
 
 // Frees memory associated with 'dmux'.
-WEBP_EXTERN(void) WebPDemuxDelete(WebPDemuxer* dmux);
+WEBP_EXTERN void WebPDemuxDelete(WebPDemuxer* dmux);
 
 //------------------------------------------------------------------------------
 // Data/information extraction.
 
 typedef enum WebPFormatFeature {
-  WEBP_FF_FORMAT_FLAGS,  // Extended format flags present in the 'VP8X' chunk.
+  WEBP_FF_FORMAT_FLAGS,      // bit-wise combination of WebPFeatureFlags
+                             // corresponding to the 'VP8X' chunk (if present).
   WEBP_FF_CANVAS_WIDTH,
   WEBP_FF_CANVAS_HEIGHT,
-  WEBP_FF_LOOP_COUNT,
-  WEBP_FF_BACKGROUND_COLOR,
-  WEBP_FF_FRAME_COUNT    // Number of frames present in the demux object.
-                         // In case of a partial demux, this is the number of
-                         // frames seen so far, with the last frame possibly
-                         // being partial.
+  WEBP_FF_LOOP_COUNT,        // only relevant for animated file
+  WEBP_FF_BACKGROUND_COLOR,  // idem.
+  WEBP_FF_FRAME_COUNT        // Number of frames present in the demux object.
+                             // In case of a partial demux, this is the number
+                             // of frames seen so far, with the last frame
+                             // possibly being partial.
 } WebPFormatFeature;
 
 // Get the 'feature' value from the 'dmux'.
 // NOTE: values are only valid if WebPDemux() was used or WebPDemuxPartial()
 // returned a state > WEBP_DEMUX_PARSING_HEADER.
-WEBP_EXTERN(uint32_t) WebPDemuxGetI(
+// If 'feature' is WEBP_FF_FORMAT_FLAGS, the returned value is a bit-wise
+// combination of WebPFeatureFlags values.
+// If 'feature' is WEBP_FF_LOOP_COUNT, WEBP_FF_BACKGROUND_COLOR, the returned
+// value is only meaningful if the bitstream is animated.
+WEBP_EXTERN uint32_t WebPDemuxGetI(
     const WebPDemuxer* dmux, WebPFormatFeature feature);
 
 //------------------------------------------------------------------------------
@@ -159,20 +164,20 @@ struct WebPIterator {
 // Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
 // Call WebPDemuxReleaseIterator() when use of the iterator is complete.
 // NOTE: 'dmux' must persist for the lifetime of 'iter'.
-WEBP_EXTERN(int) WebPDemuxGetFrame(
+WEBP_EXTERN int WebPDemuxGetFrame(
     const WebPDemuxer* dmux, int frame_number, WebPIterator* iter);
 
 // Sets 'iter->fragment' to point to the next ('iter->frame_num' + 1) or
 // previous ('iter->frame_num' - 1) frame. These functions do not loop.
 // Returns true on success, false otherwise.
-WEBP_EXTERN(int) WebPDemuxNextFrame(WebPIterator* iter);
-WEBP_EXTERN(int) WebPDemuxPrevFrame(WebPIterator* iter);
+WEBP_EXTERN int WebPDemuxNextFrame(WebPIterator* iter);
+WEBP_EXTERN int WebPDemuxPrevFrame(WebPIterator* iter);
 
 // Releases any memory associated with 'iter'.
 // Must be called before any subsequent calls to WebPDemuxGetChunk() on the same
 // iter. Also, must be called before destroying the associated WebPDemuxer with
 // WebPDemuxDelete().
-WEBP_EXTERN(void) WebPDemuxReleaseIterator(WebPIterator* iter);
+WEBP_EXTERN void WebPDemuxReleaseIterator(WebPIterator* iter);
 
 //------------------------------------------------------------------------------
 // Chunk iteration.
@@ -197,20 +202,20 @@ struct WebPChunkIterator {
 // payloads are accessed through WebPDemuxGetFrame() and related functions.
 // Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete.
 // NOTE: 'dmux' must persist for the lifetime of the iterator.
-WEBP_EXTERN(int) WebPDemuxGetChunk(const WebPDemuxer* dmux,
-                                   const char fourcc[4], int chunk_number,
-                                   WebPChunkIterator* iter);
+WEBP_EXTERN int WebPDemuxGetChunk(const WebPDemuxer* dmux,
+                                  const char fourcc[4], int chunk_number,
+                                  WebPChunkIterator* iter);
 
 // Sets 'iter->chunk' to point to the next ('iter->chunk_num' + 1) or previous
 // ('iter->chunk_num' - 1) chunk. These functions do not loop.
 // Returns true on success, false otherwise.
-WEBP_EXTERN(int) WebPDemuxNextChunk(WebPChunkIterator* iter);
-WEBP_EXTERN(int) WebPDemuxPrevChunk(WebPChunkIterator* iter);
+WEBP_EXTERN int WebPDemuxNextChunk(WebPChunkIterator* iter);
+WEBP_EXTERN int WebPDemuxPrevChunk(WebPChunkIterator* iter);
 
 // Releases any memory associated with 'iter'.
 // Must be called before destroying the associated WebPDemuxer with
 // WebPDemuxDelete().
-WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
+WEBP_EXTERN void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
 
 //------------------------------------------------------------------------------
 // WebPAnimDecoder API
@@ -252,7 +257,7 @@ struct WebPAnimDecoderOptions {
 };
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(int) WebPAnimDecoderOptionsInitInternal(
+WEBP_EXTERN int WebPAnimDecoderOptionsInitInternal(
     WebPAnimDecoderOptions*, int);
 
 // Should always be called, to initialize a fresh WebPAnimDecoderOptions
@@ -266,7 +271,7 @@ static WEBP_INLINE int WebPAnimDecoderOptionsInit(
 }
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(WebPAnimDecoder*) WebPAnimDecoderNewInternal(
+WEBP_EXTERN WebPAnimDecoder* WebPAnimDecoderNewInternal(
     const WebPData*, const WebPAnimDecoderOptions*, int);
 
 // Creates and initializes a WebPAnimDecoder object.
@@ -301,8 +306,8 @@ struct WebPAnimInfo {
 //   info - (out) global information fetched from the animation.
 // Returns:
 //   True on success.
-WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
-                                        WebPAnimInfo* info);
+WEBP_EXTERN int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
+                                       WebPAnimInfo* info);
 
 // Fetch the next frame from 'dec' based on options supplied to
 // WebPAnimDecoderNew(). This will be a fully reconstructed canvas of size
@@ -316,8 +321,8 @@ WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
 // Returns:
 //   False if any of the arguments are NULL, or if there is a parsing or
 //   decoding error, or if there are no more frames. Otherwise, returns true.
-WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
-                                        uint8_t** buf, int* timestamp);
+WEBP_EXTERN int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
+                                       uint8_t** buf, int* timestamp);
 
 // Check if there are more frames left to decode.
 // Parameters:
@@ -325,7 +330,7 @@ WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
 // Returns:
 //   True if 'dec' is not NULL and some frames are yet to be decoded.
 //   Otherwise, returns false.
-WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
+WEBP_EXTERN int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
 
 // Resets the WebPAnimDecoder object, so that next call to
 // WebPAnimDecoderGetNext() will restart decoding from 1st frame. This would be
@@ -333,7 +338,7 @@ WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
 // info.loop_count times) without destroying and recreating the 'dec' object.
 // Parameters:
 //   dec - (in/out) decoder instance to be reset
-WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec);
+WEBP_EXTERN void WebPAnimDecoderReset(WebPAnimDecoder* dec);
 
 // Grab the internal demuxer object.
 // Getting the demuxer object can be useful if one wants to use operations only
@@ -343,13 +348,13 @@ WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec);
 //
 // Parameters:
 //   dec - (in) decoder instance from which the demuxer object is to be fetched.
-WEBP_EXTERN(const WebPDemuxer*) WebPAnimDecoderGetDemuxer(
+WEBP_EXTERN const WebPDemuxer* WebPAnimDecoderGetDemuxer(
     const WebPAnimDecoder* dec);
 
 // Deletes the WebPAnimDecoder object.
 // Parameters:
 //   dec - (in/out) decoder instance to be deleted
-WEBP_EXTERN(void) WebPAnimDecoderDelete(WebPAnimDecoder* dec);
+WEBP_EXTERN void WebPAnimDecoderDelete(WebPAnimDecoder* dec);
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/thirdparty/libwebp/webp/encode.h b/thirdparty/libwebp/src/webp/encode.h
index 35fde1d052..7ec3543dc2 100644
--- a/thirdparty/libwebp/webp/encode.h
+++ b/thirdparty/libwebp/src/webp/encode.h
@@ -35,7 +35,7 @@ typedef struct WebPMemoryWriter WebPMemoryWriter;
 
 // Return the encoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetEncoderVersion(void);
+WEBP_EXTERN int WebPGetEncoderVersion(void);
 
 //------------------------------------------------------------------------------
 // One-stop-shop call! No questions asked:
@@ -46,37 +46,37 @@ WEBP_EXTERN(int) WebPGetEncoderVersion(void);
 // These functions compress using the lossy format, and the quality_factor
 // can go from 0 (smaller output, lower quality) to 100 (best quality,
 // larger output).
-WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb,
+WEBP_EXTERN size_t WebPEncodeRGB(const uint8_t* rgb,
+                                 int width, int height, int stride,
+                                 float quality_factor, uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeBGR(const uint8_t* bgr,
+                                 int width, int height, int stride,
+                                 float quality_factor, uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeRGBA(const uint8_t* rgba,
                                   int width, int height, int stride,
                                   float quality_factor, uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeBGR(const uint8_t* bgr,
+WEBP_EXTERN size_t WebPEncodeBGRA(const uint8_t* bgra,
                                   int width, int height, int stride,
                                   float quality_factor, uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeRGBA(const uint8_t* rgba,
-                                   int width, int height, int stride,
-                                   float quality_factor, uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeBGRA(const uint8_t* bgra,
-                                   int width, int height, int stride,
-                                   float quality_factor, uint8_t** output);
 
 // These functions are the equivalent of the above, but compressing in a
 // lossless manner. Files are usually larger than lossy format, but will
 // not suffer any compression loss.
-WEBP_EXTERN(size_t) WebPEncodeLosslessRGB(const uint8_t* rgb,
+WEBP_EXTERN size_t WebPEncodeLosslessRGB(const uint8_t* rgb,
+                                         int width, int height, int stride,
+                                         uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeLosslessBGR(const uint8_t* bgr,
+                                         int width, int height, int stride,
+                                         uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeLosslessRGBA(const uint8_t* rgba,
                                           int width, int height, int stride,
                                           uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeLosslessBGR(const uint8_t* bgr,
+WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra,
                                           int width, int height, int stride,
                                           uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeLosslessRGBA(const uint8_t* rgba,
-                                           int width, int height, int stride,
-                                           uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeLosslessBGRA(const uint8_t* bgra,
-                                           int width, int height, int stride,
-                                           uint8_t** output);
 
 // Releases memory returned by the WebPEncode*() functions above.
-WEBP_EXTERN(void) WebPFree(void* ptr);
+WEBP_EXTERN void WebPFree(void* ptr);
 
 //------------------------------------------------------------------------------
 // Coding parameters
@@ -93,12 +93,15 @@ typedef enum WebPImageHint {
 // Compression parameters.
 struct WebPConfig {
   int lossless;           // Lossless encoding (0=lossy(default), 1=lossless).
-  float quality;          // between 0 (smallest file) and 100 (biggest)
+  float quality;          // between 0 and 100. For lossy, 0 gives the smallest
+                          // size and 100 the largest. For lossless, this
+                          // parameter is the amount of effort put into the
+                          // compression: 0 is the fastest but gives larger
+                          // files; 100 is the slowest but compresses best.
   int method;             // quality/speed trade-off (0=fast, 6=slower-better)
 
   WebPImageHint image_hint;  // Hint for image type (lossless only for now).
 
-  // Parameters related to lossy compression only:
   int target_size;        // if non-zero, set the desired target size in bytes.
                           // Takes precedence over the 'compression' parameter.
   float target_PSNR;      // if non-zero, specifies the minimal distortion to
@@ -159,7 +162,7 @@ typedef enum WebPPreset {
 } WebPPreset;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int);
+WEBP_EXTERN int WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int);
 
 // Should always be called, to initialize a fresh WebPConfig structure before
 // modification. Returns false in case of version mismatch. WebPConfigInit()
@@ -186,15 +189,15 @@ static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
 // speed and final compressed size.
 // This function will overwrite several fields from config: 'method', 'quality'
 // and 'lossless'. Returns false in case of parameter error.
-WEBP_EXTERN(int) WebPConfigLosslessPreset(WebPConfig* config, int level);
+WEBP_EXTERN int WebPConfigLosslessPreset(WebPConfig* config, int level);
 
 // Returns true if 'config' is non-NULL and all configuration parameters are
 // within their valid ranges.
-WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* config);
+WEBP_EXTERN int WebPValidateConfig(const WebPConfig* config);
 
 //------------------------------------------------------------------------------
 // Input / Output
-// Structure for storing auxiliary statistics (mostly for lossy encoding).
+// Structure for storing auxiliary statistics.
 
 struct WebPAuxStats {
   int coded_size;         // final size
@@ -242,16 +245,16 @@ struct WebPMemoryWriter {
 };
 
 // The following must be called first before any use.
-WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);
+WEBP_EXTERN void WebPMemoryWriterInit(WebPMemoryWriter* writer);
 
 // The following must be called to deallocate writer->mem memory. The 'writer'
 // object itself is not deallocated.
-WEBP_EXTERN(void) WebPMemoryWriterClear(WebPMemoryWriter* writer);
+WEBP_EXTERN void WebPMemoryWriterClear(WebPMemoryWriter* writer);
 // The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
 // completion, writer.mem and writer.size will hold the coded data.
 // writer.mem must be freed by calling WebPMemoryWriterClear.
-WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size,
-                                 const WebPPicture* picture);
+WEBP_EXTERN int WebPMemoryWrite(const uint8_t* data, size_t data_size,
+                                const WebPPicture* picture);
 
 // Progress hook, called from time to time to report progress. It can return
 // false to request an abort of the encoding process, or true otherwise if
@@ -354,7 +357,7 @@ struct WebPPicture {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture*, int);
+WEBP_EXTERN int WebPPictureInitInternal(WebPPicture*, int);
 
 // Should always be called, to initialize the structure. Returns false in case
 // of version mismatch. WebPPictureInit() must have succeeded before using the
@@ -371,20 +374,20 @@ static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) {
 // Allocate y/u/v buffers as per colorspace/width/height specification.
 // Note! This function will free the previous buffer if needed.
 // Returns false in case of memory error.
-WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureAlloc(WebPPicture* picture);
 
 // Release the memory allocated by WebPPictureAlloc() or WebPPictureImport*().
 // Note that this function does _not_ free the memory used by the 'picture'
 // object itself.
 // Besides memory (which is reclaimed) all other fields of 'picture' are
 // preserved.
-WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture);
+WEBP_EXTERN void WebPPictureFree(WebPPicture* picture);
 
 // Copy the pixels of *src into *dst, using WebPPictureAlloc. Upon return, *dst
 // will fully own the copied pixels (this is not a view). The 'dst' picture need
 // not be initialized as its content is overwritten.
 // Returns false in case of memory allocation error.
-WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
+WEBP_EXTERN int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 
 // Compute the single distortion for packed planes of samples.
 // 'src' will be compared to 'ref', and the raw distortion stored into
@@ -393,19 +396,19 @@ WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 // 'x_step' is the horizontal stride (in bytes) between samples.
 // 'src/ref_stride' is the byte distance between rows.
 // Returns false in case of error (bad parameter, memory allocation error, ...).
-WEBP_EXTERN(int) WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
-                                     const uint8_t* ref, size_t ref_stride,
-                                     int width, int height,
-                                     size_t x_step,
-                                     int type,   // 0 = PSNR, 1 = SSIM, 2 = LSIM
-                                     float* distortion, float* result);
+WEBP_EXTERN int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                                    const uint8_t* ref, size_t ref_stride,
+                                    int width, int height,
+                                    size_t x_step,
+                                    int type,   // 0 = PSNR, 1 = SSIM, 2 = LSIM
+                                    float* distortion, float* result);
 
 // Compute PSNR, SSIM or LSIM distortion metric between two pictures. Results
 // are in dB, stored in result[] in the B/G/R/A/All order. The distortion is
 // always performed using ARGB samples. Hence if the input is YUV(A), the
 // picture will be internally converted to ARGB (just for the measurement).
 // Warning: this function is rather CPU-intensive.
-WEBP_EXTERN(int) WebPPictureDistortion(
+WEBP_EXTERN int WebPPictureDistortion(
     const WebPPicture* src, const WebPPicture* ref,
     int metric_type,           // 0 = PSNR, 1 = SSIM, 2 = LSIM
     float result[5]);
@@ -418,8 +421,8 @@ WEBP_EXTERN(int) WebPPictureDistortion(
 // must be fully be comprised inside the 'src' source picture. If the source
 // picture uses the YUV420 colorspace, the top and left coordinates will be
 // snapped to even values.
-WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture,
-                                 int left, int top, int width, int height);
+WEBP_EXTERN int WebPPictureCrop(WebPPicture* picture,
+                                int left, int top, int width, int height);
 
 // Extracts a view from 'src' picture into 'dst'. The rectangle for the view
 // is defined by the top-left corner pixel coordinates (left, top) as well
@@ -432,42 +435,42 @@ WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture,
 // with WebPPictureInit() if it is different from 'src', since its content will
 // be overwritten.
 // Returns false in case of memory allocation error or invalid parameters.
-WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
-                                 int left, int top, int width, int height,
-                                 WebPPicture* dst);
+WEBP_EXTERN int WebPPictureView(const WebPPicture* src,
+                                int left, int top, int width, int height,
+                                WebPPicture* dst);
 
 // Returns true if the 'picture' is actually a view and therefore does
 // not own the memory for pixels.
-WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture);
+WEBP_EXTERN int WebPPictureIsView(const WebPPicture* picture);
 
 // Rescale a picture to new dimension width x height.
 // If either 'width' or 'height' (but not both) is 0 the corresponding
 // dimension will be calculated preserving the aspect ratio.
 // No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
-WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height);
+WEBP_EXTERN int WebPPictureRescale(WebPPicture* pic, int width, int height);
 
 // Colorspace conversion function to import RGB samples.
 // Previous buffer will be free'd, if any.
 // *rgb buffer should have a size of at least height * rgb_stride.
 // Returns false in case of memory error.
-WEBP_EXTERN(int) WebPPictureImportRGB(
+WEBP_EXTERN int WebPPictureImportRGB(
     WebPPicture* picture, const uint8_t* rgb, int rgb_stride);
 // Same, but for RGBA buffer.
-WEBP_EXTERN(int) WebPPictureImportRGBA(
+WEBP_EXTERN int WebPPictureImportRGBA(
     WebPPicture* picture, const uint8_t* rgba, int rgba_stride);
 // Same, but for RGBA buffer. Imports the RGB direct from the 32-bit format
 // input buffer ignoring the alpha channel. Avoids needing to copy the data
 // to a temporary 24-bit RGB buffer to import the RGB only.
-WEBP_EXTERN(int) WebPPictureImportRGBX(
+WEBP_EXTERN int WebPPictureImportRGBX(
     WebPPicture* picture, const uint8_t* rgbx, int rgbx_stride);
 
 // Variants of the above, but taking BGR(A|X) input.
-WEBP_EXTERN(int) WebPPictureImportBGR(
+WEBP_EXTERN int WebPPictureImportBGR(
     WebPPicture* picture, const uint8_t* bgr, int bgr_stride);
-WEBP_EXTERN(int) WebPPictureImportBGRA(
+WEBP_EXTERN int WebPPictureImportBGRA(
     WebPPicture* picture, const uint8_t* bgra, int bgra_stride);
-WEBP_EXTERN(int) WebPPictureImportBGRX(
+WEBP_EXTERN int WebPPictureImportBGRX(
     WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);
 
 // Converts picture->argb data to the YUV420A format. The 'colorspace'
@@ -476,14 +479,14 @@ WEBP_EXTERN(int) WebPPictureImportBGRX(
 // non-opaque transparent values is detected, and 'colorspace' will be
 // adjusted accordingly. Note that this method is lossy.
 // Returns false in case of error.
-WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
-                                       WebPEncCSP /*colorspace = WEBP_YUV420*/);
+WEBP_EXTERN int WebPPictureARGBToYUVA(WebPPicture* picture,
+                                      WebPEncCSP /*colorspace = WEBP_YUV420*/);
 
 // Same as WebPPictureARGBToYUVA(), but the conversion is done using
 // pseudo-random dithering with a strength 'dithering' between
 // 0.0 (no dithering) and 1.0 (maximum dithering). This is useful
 // for photographic picture.
-WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
+WEBP_EXTERN int WebPPictureARGBToYUVADithered(
     WebPPicture* picture, WebPEncCSP colorspace, float dithering);
 
 // Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion.
@@ -491,9 +494,9 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
 // method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
 // and sharper YUV representation.
 // Returns false in case of error.
-WEBP_EXTERN(int) WebPPictureSharpARGBToYUVA(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureSharpARGBToYUVA(WebPPicture* picture);
 // kept for backward compatibility:
-WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
 // The input format must be YUV_420 or YUV_420A. The conversion from YUV420 to
@@ -501,22 +504,22 @@ WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 // Note that the use of this colorspace is discouraged if one has access to the
 // raw ARGB samples, since using YUV420 is comparatively lossy.
 // Returns false in case of error.
-WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureYUVAToARGB(WebPPicture* picture);
 
 // Helper function: given a width x height plane of RGBA or YUV(A) samples
-// clean-up the YUV or RGB samples under fully transparent area, to help
-// compressibility (no guarantee, though).
-WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture);
+// clean up or smooth the YUV or RGB samples under fully transparent areas,
+// to help compressibility (no guarantee, though).
+WEBP_EXTERN void WebPCleanupTransparentArea(WebPPicture* picture);
 
 // Scan the picture 'picture' for the presence of non fully opaque alpha values.
 // Returns true in such case. Otherwise returns false (indicating that the
 // alpha plane can be ignored altogether e.g.).
-WEBP_EXTERN(int) WebPPictureHasTransparency(const WebPPicture* picture);
+WEBP_EXTERN int WebPPictureHasTransparency(const WebPPicture* picture);
 
 // Remove the transparency information (if present) by blending the color with
 // the background color 'background_rgb' (specified as 24bit RGB triplet).
 // After this call, all alpha values are reset to 0xff.
-WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
+WEBP_EXTERN void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
 
 //------------------------------------------------------------------------------
 // Main call
@@ -531,7 +534,7 @@ WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
 // the former for lossy encoding, and the latter for lossless encoding
 // (when config.lossless is true). Automatic conversion from one format to
 // another is provided but they both incur some loss.
-WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture);
+WEBP_EXTERN int WebPEncode(const WebPConfig* config, WebPPicture* picture);
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/webp/format_constants.h b/thirdparty/libwebp/src/webp/format_constants.h
index 329fc8a3b0..329fc8a3b0 100644
--- a/thirdparty/libwebp/webp/format_constants.h
+++ b/thirdparty/libwebp/src/webp/format_constants.h
diff --git a/thirdparty/libwebp/webp/mux.h b/thirdparty/libwebp/src/webp/mux.h
index daccc65e86..28bb4a41c9 100644
--- a/thirdparty/libwebp/webp/mux.h
+++ b/thirdparty/libwebp/src/webp/mux.h
@@ -98,13 +98,13 @@ typedef enum WebPChunkId {
 
 // Returns the version number of the mux library, packed in hexadecimal using
 // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetMuxVersion(void);
+WEBP_EXTERN int WebPGetMuxVersion(void);
 
 //------------------------------------------------------------------------------
 // Life of a Mux object
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(WebPMux*) WebPNewInternal(int);
+WEBP_EXTERN WebPMux* WebPNewInternal(int);
 
 // Creates an empty mux object.
 // Returns:
@@ -117,13 +117,13 @@ static WEBP_INLINE WebPMux* WebPMuxNew(void) {
 // Deletes the mux object.
 // Parameters:
 //   mux - (in/out) object to be deleted
-WEBP_EXTERN(void) WebPMuxDelete(WebPMux* mux);
+WEBP_EXTERN void WebPMuxDelete(WebPMux* mux);
 
 //------------------------------------------------------------------------------
 // Mux creation.
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(WebPMux*) WebPMuxCreateInternal(const WebPData*, int, int);
+WEBP_EXTERN WebPMux* WebPMuxCreateInternal(const WebPData*, int, int);
 
 // Creates a mux object from raw data given in WebP RIFF format.
 // Parameters:
@@ -160,7 +160,7 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk(
+WEBP_EXTERN WebPMuxError WebPMuxSetChunk(
     WebPMux* mux, const char fourcc[4], const WebPData* chunk_data,
     int copy_data);
 
@@ -176,7 +176,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk(
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given id.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk(
+WEBP_EXTERN WebPMuxError WebPMuxGetChunk(
     const WebPMux* mux, const char fourcc[4], WebPData* chunk_data);
 
 // Deletes the chunk with the given 'fourcc' from the mux object.
@@ -189,7 +189,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk(
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given fourcc.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk(
+WEBP_EXTERN WebPMuxError WebPMuxDeleteChunk(
     WebPMux* mux, const char fourcc[4]);
 
 //------------------------------------------------------------------------------
@@ -222,7 +222,7 @@ struct WebPMuxFrameInfo {
 //   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
+WEBP_EXTERN WebPMuxError WebPMuxSetImage(
     WebPMux* mux, const WebPData* bitstream, int copy_data);
 
 // Adds a frame at the end of the mux object.
@@ -241,7 +241,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
 //                               or if content of 'frame' is invalid.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame(
+WEBP_EXTERN WebPMuxError WebPMuxPushFrame(
     WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
 
 // Gets the nth frame from the mux object.
@@ -259,7 +259,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame(
 //   WEBP_MUX_BAD_DATA - if nth frame chunk in mux is invalid.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame(
+WEBP_EXTERN WebPMuxError WebPMuxGetFrame(
     const WebPMux* mux, uint32_t nth, WebPMuxFrameInfo* frame);
 
 // Deletes a frame from the mux object.
@@ -272,7 +272,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame(
 //   WEBP_MUX_NOT_FOUND - If there are less than nth frames in the mux object
 //                        before deletion.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth);
+WEBP_EXTERN WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth);
 
 //------------------------------------------------------------------------------
 // Animation.
@@ -296,7 +296,7 @@ struct WebPMuxAnimParams {
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
+WEBP_EXTERN WebPMuxError WebPMuxSetAnimationParams(
     WebPMux* mux, const WebPMuxAnimParams* params);
 
 // Gets the animation parameters from the mux object.
@@ -307,7 +307,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
 //   WEBP_MUX_NOT_FOUND - if ANIM chunk is not present in mux object.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
+WEBP_EXTERN WebPMuxError WebPMuxGetAnimationParams(
     const WebPMux* mux, WebPMuxAnimParams* params);
 
 //------------------------------------------------------------------------------
@@ -328,8 +328,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
 //   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL; or
 //                               width or height are invalid or out of bounds
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux,
-                                               int width, int height);
+WEBP_EXTERN WebPMuxError WebPMuxSetCanvasSize(WebPMux* mux,
+                                              int width, int height);
 
 // Gets the canvas size from the mux object.
 // Note: This method assumes that the VP8X chunk, if present, is up-to-date.
@@ -343,8 +343,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux,
 //   WEBP_MUX_INVALID_ARGUMENT - if mux, width or height is NULL.
 //   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux,
-                                               int* width, int* height);
+WEBP_EXTERN WebPMuxError WebPMuxGetCanvasSize(const WebPMux* mux,
+                                              int* width, int* height);
 
 // Gets the feature flags from the mux object.
 // Note: This method assumes that the VP8X chunk, if present, is up-to-date.
@@ -359,8 +359,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux,
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL.
 //   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux,
-                                             uint32_t* flags);
+WEBP_EXTERN WebPMuxError WebPMuxGetFeatures(const WebPMux* mux,
+                                            uint32_t* flags);
 
 // Gets number of chunks with the given 'id' in the mux object.
 // Parameters:
@@ -370,8 +370,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux,
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux, or num_elements is NULL.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
-                                           WebPChunkId id, int* num_elements);
+WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
+                                          WebPChunkId id, int* num_elements);
 
 // Assembles all chunks in WebP RIFF format and returns in 'assembled_data'.
 // This function also validates the mux object.
@@ -388,8 +388,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or assembled_data is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxAssemble(WebPMux* mux,
-                                          WebPData* assembled_data);
+WEBP_EXTERN WebPMuxError WebPMuxAssemble(WebPMux* mux,
+                                         WebPData* assembled_data);
 
 //------------------------------------------------------------------------------
 // WebPAnimEncoder API
@@ -442,7 +442,7 @@ struct WebPAnimEncoderOptions {
 };
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(int) WebPAnimEncoderOptionsInitInternal(
+WEBP_EXTERN int WebPAnimEncoderOptionsInitInternal(
     WebPAnimEncoderOptions*, int);
 
 // Should always be called, to initialize a fresh WebPAnimEncoderOptions
@@ -455,7 +455,7 @@ static WEBP_INLINE int WebPAnimEncoderOptionsInit(
 }
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(WebPAnimEncoder*) WebPAnimEncoderNewInternal(
+WEBP_EXTERN WebPAnimEncoder* WebPAnimEncoderNewInternal(
     int, int, const WebPAnimEncoderOptions*, int);
 
 // Creates and initializes a WebPAnimEncoder object.
@@ -490,7 +490,7 @@ static WEBP_INLINE WebPAnimEncoder* WebPAnimEncoderNew(
 // Returns:
 //   On error, returns false and frame->error_code is set appropriately.
 //   Otherwise, returns true.
-WEBP_EXTERN(int) WebPAnimEncoderAdd(
+WEBP_EXTERN int WebPAnimEncoderAdd(
     WebPAnimEncoder* enc, struct WebPPicture* frame, int timestamp_ms,
     const struct WebPConfig* config);
 
@@ -503,8 +503,8 @@ WEBP_EXTERN(int) WebPAnimEncoderAdd(
 //   webp_data - (out) generated WebP bitstream.
 // Returns:
 //   True on success.
-WEBP_EXTERN(int) WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
-                                         WebPData* webp_data);
+WEBP_EXTERN int WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
+                                        WebPData* webp_data);
 
 // Get error string corresponding to the most recent call using 'enc'. The
 // returned string is owned by 'enc' and is valid only until the next call to
@@ -514,12 +514,12 @@ WEBP_EXTERN(int) WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
 // Returns:
 //   NULL if 'enc' is NULL. Otherwise, returns the error string if the last call
 //   to 'enc' had an error, or an empty string if the last call was a success.
-WEBP_EXTERN(const char*) WebPAnimEncoderGetError(WebPAnimEncoder* enc);
+WEBP_EXTERN const char* WebPAnimEncoderGetError(WebPAnimEncoder* enc);
 
 // Deletes the WebPAnimEncoder object.
 // Parameters:
 //   enc - (in/out) object to be deleted
-WEBP_EXTERN(void) WebPAnimEncoderDelete(WebPAnimEncoder* enc);
+WEBP_EXTERN void WebPAnimEncoderDelete(WebPAnimEncoder* enc);
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/webp/mux_types.h b/thirdparty/libwebp/src/webp/mux_types.h
index b37e2c67aa..b37e2c67aa 100644
--- a/thirdparty/libwebp/webp/mux_types.h
+++ b/thirdparty/libwebp/src/webp/mux_types.h
diff --git a/thirdparty/libwebp/webp/types.h b/thirdparty/libwebp/src/webp/types.h
index 98fff35a11..989a763f0d 100644
--- a/thirdparty/libwebp/webp/types.h
+++ b/thirdparty/libwebp/src/webp/types.h
@@ -40,9 +40,9 @@ typedef long long int int64_t;
 // This explicitly marks library functions and allows for changing the
 // signature for e.g., Windows DLL builds.
 # if defined(__GNUC__) && __GNUC__ >= 4
-#  define WEBP_EXTERN(type) extern __attribute__ ((visibility ("default"))) type
+#  define WEBP_EXTERN extern __attribute__ ((visibility ("default")))
 # else
-#  define WEBP_EXTERN(type) extern type
+#  define WEBP_EXTERN extern
 # endif  /* __GNUC__ >= 4 */
 #endif  /* WEBP_EXTERN */
 
diff --git a/thirdparty/thekla_atlas/nvcore/Debug.cpp b/thirdparty/thekla_atlas/nvcore/Debug.cpp
index 75ac6beb75..4980ffa916 100644
--- a/thirdparty/thekla_atlas/nvcore/Debug.cpp
+++ b/thirdparty/thekla_atlas/nvcore/Debug.cpp
@@ -14,18 +14,18 @@
 #   define VC_EXTRALEAN
 #   include <windows.h>
 #   include <direct.h>
-#   if NV_CC_MSVC
-#       include <crtdbg.h>
-#       if _MSC_VER < 1300
-#           define DECLSPEC_DEPRECATED
+// -- GODOT start --
+#   include <crtdbg.h>
+#   if _MSC_VER < 1300
+#       define DECLSPEC_DEPRECATED
 // VC6: change this path to your Platform SDK headers
-#           include <dbghelp.h> // must be XP version of file
-//          include "M:\\dev7\\vs\\devtools\\common\\win32sdk\\include\\dbghelp.h"
-#       else
+#       include <dbghelp.h> // must be XP version of file
+//      include "M:\\dev7\\vs\\devtools\\common\\win32sdk\\include\\dbghelp.h"
+#   else
 // VC7: ships with updated headers
-#           include <dbghelp.h>
-#       endif
+#       include <dbghelp.h>
 #   endif
+// -- GODOT end --
 #   pragma comment(lib,"dbghelp.lib")
 #endif
 
@@ -109,8 +109,9 @@ namespace
 
 #endif
 
-
-#if (NV_OS_WIN32 && NV_CC_MSVC) || NV_OS_DURANGO
+// -- GODOT start --
+#if NV_OS_WIN32 || NV_OS_DURANGO
+// -- GODOT end --
 
     // We should try to simplify the top level filter as much as possible.
     // http://www.nynaeve.net/?p=128
@@ -393,8 +394,10 @@ namespace
 #pragma warning(disable:4748)
     static NV_NOINLINE int backtrace(void * trace[], int maxcount) {
         CONTEXT ctx = { 0 };
+// -- GODOT start --
 #if NV_CPU_X86 && !NV_CPU_X86_64
         ctx.ContextFlags = CONTEXT_CONTROL;
+#if NV_CC_MSVC
         _asm {
              call x
           x: pop eax
@@ -403,6 +406,13 @@ namespace
              mov ctx.Esp, esp
         }
 #else
+        register long unsigned int ebp asm("ebp");
+        ctx.Eip = (DWORD) __builtin_return_address(0);
+        ctx.Ebp = ebp;
+        ctx.Esp = (DWORD) __builtin_frame_address(0);
+#endif
+// -- GODOT end --
+#else
         RtlCaptureContext(&ctx); // Not implemented correctly in x86.
 #endif
 
diff --git a/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h b/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h
index f35ed88575..e1c8d6e4f8 100644
--- a/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h
+++ b/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h
@@ -19,7 +19,9 @@
 #endif
 
 #define NV_FASTCALL		__attribute__((fastcall))
-#define NV_FORCEINLINE	__attribute__((always_inline))
+// -- GODOT start --
+#define NV_FORCEINLINE	__attribute__((always_inline)) inline
+// -- GODOT end --
 #define NV_DEPRECATED   __attribute__((deprecated))
 
 #if __GNUC__ > 2
diff --git a/thirdparty/thekla_atlas/nvmath/ftoi.h b/thirdparty/thekla_atlas/nvmath/ftoi.h
index bee15c0908..182c56d1c3 100644
--- a/thirdparty/thekla_atlas/nvmath/ftoi.h
+++ b/thirdparty/thekla_atlas/nvmath/ftoi.h
@@ -53,7 +53,10 @@ namespace nv
         return (val<0) ? ftoi_ceil_xs(val) : ftoi_floor_xs(val);
     }
 
-#if NV_CPU_X86 || NV_CPU_X86_64
+// -- GODOT start --
+//#if NV_CPU_X86 || NV_CPU_X86_64
+#if NV_USE_SSE
+// -- GODOT end --
 
     NV_FORCEINLINE int ftoi_round_sse(float f) {
         return _mm_cvt_ss2si(_mm_set_ss(f));
diff --git a/thirdparty/thekla_atlas/nvmath/nvmath.h b/thirdparty/thekla_atlas/nvmath/nvmath.h
index 695f452c1d..f2b69426e1 100644
--- a/thirdparty/thekla_atlas/nvmath/nvmath.h
+++ b/thirdparty/thekla_atlas/nvmath/nvmath.h
@@ -14,10 +14,12 @@
 #include <float.h>  // finite, isnan
 #endif
 
-#if NV_CPU_X86 || NV_CPU_X86_64
-    //#include <intrin.h>
-    #include <xmmintrin.h>
-#endif
+// -- GODOT start --
+//#if NV_CPU_X86 || NV_CPU_X86_64
+//    //#include <intrin.h>
+//    #include <xmmintrin.h>
+//#endif
+// -- GODOT end --
 
 
 
@@ -65,6 +67,13 @@
 #endif
 
 
+// -- GODOT start --
+#if NV_USE_SSE
+    //#include <intrin.h>
+    #include <xmmintrin.h>
+#endif
+// -- GODOT end --
+
 
 #ifndef PI
 #define PI                  float(3.1415926535897932384626433833)