89 files changed, 15599 insertions, 8210 deletions
diff --git a/core/error/error_macros.cpp b/core/error/error_macros.cpp
index 928ddd3397..ceccd43259 100644
--- a/core/error/error_macros.cpp
+++ b/core/error/error_macros.cpp
@@ -118,3 +118,7 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const String &p_message, bool p_editor_notify, bool p_fatal) {
 	_err_print_index_error(p_function, p_file, p_line, p_index, p_size, p_index_str, p_size_str, p_message.utf8().get_data(), p_fatal);
 }
+
+void _err_flush_stdout() {
+	fflush(stdout); // Flush buffered stdout now; the CRASH_NOW*() macros call this right before GENERATE_TRAP().
+}
diff --git a/core/error/error_macros.h b/core/error/error_macros.h
index 802d7f9ef4..7b032fb4cd 100644
--- a/core/error/error_macros.h
+++ b/core/error/error_macros.h
@@ -69,6 +69,7 @@ void _err_print_error(const char *p_function, const char *p_file, int p_line, co
 void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const String &p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
 void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const char *p_message = "", bool p_editor_notify = false, bool fatal = false);
 void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const String &p_message, bool p_editor_notify = false, bool fatal = false);
+void _err_flush_stdout();
 
 #ifdef __GNUC__
 //#define FUNCTION_STR __PRETTY_FUNCTION__ - too annoying
@@ -789,6 +790,7 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 #define CRASH_NOW()                                                                           \
 	if (true) {                                                                               \
 		_err_print_error(FUNCTION_STR, __FILE__, __LINE__, "FATAL: Method/function failed."); \
+		_err_flush_stdout();                                                                  \
 		GENERATE_TRAP();                                                                      \
 	} else                                                                                    \
 		((void)0)
@@ -801,6 +803,7 @@ void _err_print_index_error(const char *p_function, const char *p_file, int p_li
 #define CRASH_NOW_MSG(m_msg)                                                                         \
 	if (true) {                                                                                      \
 		_err_print_error(FUNCTION_STR, __FILE__, __LINE__, "FATAL: Method/function failed.", m_msg); \
+		_err_flush_stdout();                                                                         \
 		GENERATE_TRAP();                                                                             \
 	} else                                                                                           \
 		((void)0)
diff --git a/core/io/file_access_network.cpp b/core/io/file_access_network.cpp
index 307004b1c2..cb38ac0928 100644
--- a/core/io/file_access_network.cpp
+++ b/core/io/file_access_network.cpp
@@ -487,7 +487,6 @@ FileAccessNetwork::~FileAccessNetwork() {
 
 	FileAccessNetworkClient *nc = FileAccessNetworkClient::singleton;
 	nc->lock_mutex();
-	id = nc->last_id++;
 	nc->accesses.erase(id);
 	nc->unlock_mutex();
 }
diff --git a/core/io/image.cpp b/core/io/image.cpp
index 4f72599faf..577fc59807 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -30,14 +30,17 @@
 
 #include "image.h"
 
+#include "core/error/error_list.h"
 #include "core/error/error_macros.h"
 #include "core/io/image_loader.h"
 #include "core/io/resource_loader.h"
 #include "core/math/math_funcs.h"
 #include "core/string/print_string.h"
 #include "core/templates/hash_map.h"
+#include "core/variant/dictionary.h"
 
 #include <stdio.h>
+#include <cmath>
 
 const char *Image::format_names[Image::FORMAT_MAX] = {
 	"Lum8", //luminance
@@ -3135,6 +3138,8 @@ void Image::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("rgbe_to_srgb"), &Image::rgbe_to_srgb);
 	ClassDB::bind_method(D_METHOD("bump_map_to_normal_map", "bump_scale"), &Image::bump_map_to_normal_map, DEFVAL(1.0));
 
+	ClassDB::bind_method(D_METHOD("compute_image_metrics", "compared_image", "use_luma"), &Image::compute_image_metrics);
+
 	ClassDB::bind_method(D_METHOD("blit_rect", "src", "src_rect", "dst"), &Image::blit_rect);
 	ClassDB::bind_method(D_METHOD("blit_rect_mask", "src", "mask", "src_rect", "dst"), &Image::blit_rect_mask);
 	ClassDB::bind_method(D_METHOD("blend_rect", "src", "src_rect", "dst"), &Image::blend_rect);
@@ -3620,3 +3625,128 @@ Ref<Resource> Image::duplicate(bool p_subresources) const {
 void Image::set_as_black() {
 	memset(data.ptrw(), 0, data.size());
 }
+
+Dictionary Image::compute_image_metrics(const Ref<Image> p_compared_image, bool p_luma_metric) {
+	// https://github.com/richgel999/bc7enc_rdo/blob/master/LICENSE
+	//
+	// This is free and unencumbered software released into the public domain.
+	// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+	// software, either in source code form or as a compiled binary, for any purpose,
+	// commercial or non - commercial, and by any means.
+	// In jurisdictions that recognize copyright laws, the author or authors of this
+	// software dedicate any and all copyright interest in the software to the public
+	// domain. We make this dedication for the benefit of the public at large and to
+	// the detriment of our heirs and successors. We intend this dedication to be an
+	// overt act of relinquishment in perpetuity of all present and future rights to
+	// this software under copyright law.
+	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+	// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+	// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+	// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+	// Compares this image with `p_compared_image` over the area both of them cover and returns
+	// "max", "mean", "mean_squared", "root_mean_squared" and "peak_snr" of the 8-bit absolute
+	// differences: one REC709 luma value per pixel if `p_luma_metric`, else all four channels.
+	// Every failure path returns the placeholder values assigned right below.
+	Dictionary result;
+	result["max"] = INFINITY;
+	result["mean"] = INFINITY;
+	result["mean_squared"] = INFINITY;
+	result["root_mean_squared"] = INFINITY;
+	result["peak_snr"] = 0.0f;
+
+	ERR_FAIL_NULL_V(p_compared_image, result);
+	// Work on copies so that decompressing does not modify either input image.
+	Error err = OK;
+	Ref<Image> image_a = duplicate(true);
+	if (image_a->is_compressed()) {
+		err = image_a->decompress();
+	}
+	ERR_FAIL_COND_V(err != OK, result);
+	Ref<Image> image_b = p_compared_image->duplicate(true);
+	if (image_b->is_compressed()) {
+		err = image_b->decompress();
+	}
+	ERR_FAIL_COND_V(err != OK, result);
+
+	ERR_FAIL_COND_V_MSG((image_a->get_format() >= Image::FORMAT_RH) && (image_a->get_format() <= Image::FORMAT_RGBE9995), result, "Metrics on HDR images are not supported.");
+	ERR_FAIL_COND_V_MSG((image_b->get_format() >= Image::FORMAT_RH) && (image_b->get_format() <= Image::FORMAT_RGBE9995), result, "Metrics on HDR images are not supported.");
+
+	const bool average_component_error = true;
+	// Number of histogram entries recorded per pixel by the loop below.
+	const uint32_t values_per_pixel = p_luma_metric ? 1 : 4;
+
+	const uint32_t width = MIN(image_a->get_width(), image_b->get_width());
+	const uint32_t height = MIN(image_a->get_height(), image_b->get_height());
+	// Without pixels the averages below would divide zero by zero.
+	ERR_FAIL_COND_V_MSG(width == 0 || height == 0, result, "Can't compute metrics on an empty image.");
+
+	// Histogram approach originally due to Charles Bloom.
+	double hist[256];
+	memset(hist, 0, sizeof(hist));
+
+	for (uint32_t y = 0; y < height; y++) {
+		for (uint32_t x = 0; x < width; x++) {
+			const Color color_a = image_a->get_pixel(x, y);
+			const Color color_b = image_b->get_pixel(x, y);
+
+			// Only the channels that are actually compared have to be in LDR range.
+			ERR_FAIL_COND_V_MSG(color_a.r > 1.0f || color_a.g > 1.0f || color_a.b > 1.0f, result, "Can't compare HDR colors.");
+			ERR_FAIL_COND_V_MSG(color_b.r > 1.0f || color_b.g > 1.0f || color_b.b > 1.0f, result, "Can't compare HDR colors.");
+
+			if (!p_luma_metric) {
+				ERR_FAIL_COND_V_MSG(color_a.a > 1.0f || color_b.a > 1.0f, result, "Can't compare HDR colors.");
+				hist[Math::abs(color_a.get_r8() - color_b.get_r8())]++;
+				hist[Math::abs(color_a.get_g8() - color_b.get_g8())]++;
+				hist[Math::abs(color_a.get_b8() - color_b.get_b8())]++;
+				hist[Math::abs(color_a.get_a8() - color_b.get_a8())]++;
+			} else {
+				// REC709 weightings
+				int luma_a = (13938U * color_a.get_r8() + 46869U * color_a.get_g8() + 4729U * color_a.get_b8() + 32768U) >> 16U;
+				int luma_b = (13938U * color_b.get_r8() + 46869U * color_b.get_g8() + 4729U * color_b.get_b8() + 32768U) >> 16U;
+				hist[Math::abs(luma_a - luma_b)]++;
+			}
+		}
+	}
+
+	// `i` is the absolute difference, `hist[i]` the number of samples that showed it.
+	double image_metric_max = 0.0;
+	double sum = 0.0, sum2 = 0.0;
+	for (uint32_t i = 0; i < 256; i++) {
+		if (hist[i] == 0.0) {
+			continue;
+		}
+
+		image_metric_max = MAX(image_metric_max, double(i));
+
+		const double x = i * hist[i];
+		sum += x;
+		sum2 += i * x;
+	}
+
+	// See http://richg42.blogspot.com/2016/09/how-to-compute-psnr-from-old-berkeley.html
+	double total_values = double(width) * double(height);
+
+	if (average_component_error) {
+		// Divide by the number of recorded samples: luma mode stores one per pixel, not four.
+		total_values *= values_per_pixel;
+	}
+
+	const double image_metric_mean = CLAMP(sum / total_values, 0.0f, 255.0f);
+	const double image_metric_mean_squared = CLAMP(sum2 / total_values, 0.0f, 255.0f * 255.0f);
+	const double image_metric_root_mean_squared = sqrt(image_metric_mean_squared);
+
+	// Identical images have no error; report a large finite PSNR instead of dividing by zero.
+	double image_metric_peak_snr = 1e+10f;
+	if (image_metric_root_mean_squared != 0.0) {
+		image_metric_peak_snr = CLAMP(log10(255.0f / image_metric_root_mean_squared) * 20.0f, 0.0f, 500.0f);
+	}
+	result["max"] = image_metric_max;
+	result["mean"] = image_metric_mean;
+	result["mean_squared"] = image_metric_mean_squared;
+	result["root_mean_squared"] = image_metric_root_mean_squared;
+	result["peak_snr"] = image_metric_peak_snr;
+	return result;
+}
diff --git a/core/io/image.h b/core/io/image.h
index ddfb2bb01d..53bfa0881f 100644
--- a/core/io/image.h
+++ b/core/io/image.h
@@ -399,6 +399,8 @@ public:
 		mipmaps = p_image->mipmaps;
 		data = p_image->data;
 	}
+
+	Dictionary compute_image_metrics(const Ref<Image> p_compared_image, bool p_luma_metric = true);
 };
 
 VARIANT_ENUM_CAST(Image::Format)
diff --git a/core/io/resource_importer.cpp b/core/io/resource_importer.cpp
index 470fb2d42d..e167611d4a 100644
--- a/core/io/resource_importer.cpp
+++ b/core/io/resource_importer.cpp
@@ -463,3 +463,8 @@ void ResourceImporter::_bind_methods() {
 	BIND_ENUM_CONSTANT(IMPORT_ORDER_DEFAULT);
 	BIND_ENUM_CONSTANT(IMPORT_ORDER_SCENE);
 }
+
+void ResourceFormatImporter::add_importer(const Ref<ResourceImporter> &p_importer) {
+	ERR_FAIL_COND(p_importer.is_null()); // Null references are rejected and never stored.
+	importers.insert(0, p_importer); // Inserted at index 0, ahead of earlier registrations - presumably so newer importers are matched first; confirm against the lookup code.
+}
diff --git a/core/io/resource_importer.h b/core/io/resource_importer.h
index 261afbab69..f242f29ccd 100644
--- a/core/io/resource_importer.h
+++ b/core/io/resource_importer.h
@@ -80,9 +80,8 @@ public:
 	String get_internal_resource_path(const String &p_path) const;
 	void get_internal_resource_path_list(const String &p_path, List<String> *r_paths);
 
-	void add_importer(const Ref<ResourceImporter> &p_importer) {
-		importers.push_back(p_importer);
-	}
+	void add_importer(const Ref<ResourceImporter> &p_importer);
+
 	void remove_importer(const Ref<ResourceImporter> &p_importer) { importers.erase(p_importer); }
 	Ref<ResourceImporter> get_importer_by_name(const String &p_name) const;
 	Ref<ResourceImporter> get_importer_by_extension(const String &p_extension) const;
diff --git a/core/math/expression.cpp b/core/math/expression.cpp
index b447d09887..0ddac9744e 100644
--- a/core/math/expression.cpp
+++ b/core/math/expression.cpp
@@ -361,6 +361,7 @@ Error Expression::_get_token(Token &r_token) {
 									is_float = true;
 								} else if (c == 'e') {
 									reading = READING_EXP;
+									is_float = true;
 								} else {
 									reading = READING_DONE;
 								}
@@ -397,9 +398,6 @@ Error Expression::_get_token(Token &r_token) {
 									exp_beg = true;
 
 								} else if ((c == '-' || c == '+') && !exp_sign && !exp_beg) {
-									if (c == '-') {
-										is_float = true;
-									}
 									exp_sign = true;
 
 								} else {
diff --git a/core/multiplayer/multiplayer_api.cpp b/core/multiplayer/multiplayer_api.cpp
index 41d6d14696..c8cb333e2c 100644
--- a/core/multiplayer/multiplayer_api.cpp
+++ b/core/multiplayer/multiplayer_api.cpp
@@ -32,8 +32,6 @@
 
 #include "core/debugger/engine_debugger.h"
 #include "core/io/marshalls.h"
-#include "core/multiplayer/rpc_manager.h"
-#include "scene/main/node.h"
 
 #include <stdint.h>
 
@@ -42,6 +40,8 @@
 #endif
 
 MultiplayerReplicationInterface *(*MultiplayerAPI::create_default_replication_interface)(MultiplayerAPI *p_multiplayer) = nullptr;
+MultiplayerRPCInterface *(*MultiplayerAPI::create_default_rpc_interface)(MultiplayerAPI *p_multiplayer) = nullptr;
+MultiplayerCacheInterface *(*MultiplayerAPI::create_default_cache_interface)(MultiplayerAPI *p_multiplayer) = nullptr;
 
 #ifdef DEBUG_ENABLED
 void MultiplayerAPI::profile_bandwidth(const String &p_inout, int p_size) {
@@ -91,18 +91,17 @@ void MultiplayerAPI::poll() {
 
 void MultiplayerAPI::clear() {
 	connected_peers.clear();
-	path_get_cache.clear();
-	path_send_cache.clear();
 	packet_cache.clear();
-	last_send_cache_id = 1;
+	cache->clear();
 }
 
-void MultiplayerAPI::set_root_node(Node *p_node) {
-	root_node = p_node;
+void MultiplayerAPI::set_root_path(const NodePath &p_path) {
+	ERR_FAIL_COND_MSG(!p_path.is_absolute() && !p_path.is_empty(), "MultiplayerAPI root path must be absolute."); // An empty path is accepted too; _process_packet() refuses packets while root_path is empty.
+	root_path = p_path;
+}
 
-Node *MultiplayerAPI::get_root_node() {
-	return root_node;
+NodePath MultiplayerAPI::get_root_path() const {
+	return root_path;
 }
 
 void MultiplayerAPI::set_multiplayer_peer(const Ref<MultiplayerPeer> &p_peer) {
@@ -139,7 +138,7 @@ Ref<MultiplayerPeer> MultiplayerAPI::get_multiplayer_peer() const {
 }
 
 void MultiplayerAPI::_process_packet(int p_from, const uint8_t *p_packet, int p_packet_len) {
-	ERR_FAIL_COND_MSG(root_node == nullptr, "Multiplayer root node was not initialized. If you are using custom multiplayer, remember to set the root node via MultiplayerAPI.set_root_node before using it.");
+	ERR_FAIL_COND_MSG(root_path.is_empty(), "Multiplayer root was not initialized. If you are using custom multiplayer, remember to set the root path via MultiplayerAPI.set_root_path before using it.");
 	ERR_FAIL_COND_MSG(p_packet_len < 1, "Invalid packet received. Size too small.");
 
 #ifdef DEBUG_ENABLED
@@ -151,15 +150,15 @@ void MultiplayerAPI::_process_packet(int p_from, const uint8_t *p_packet, int p_
 
 	switch (packet_type) {
 		case NETWORK_COMMAND_SIMPLIFY_PATH: {
-			_process_simplify_path(p_from, p_packet, p_packet_len);
+			cache->process_simplify_path(p_from, p_packet, p_packet_len);
 		} break;
 
 		case NETWORK_COMMAND_CONFIRM_PATH: {
-			_process_confirm_path(p_from, p_packet, p_packet_len);
+			cache->process_confirm_path(p_from, p_packet, p_packet_len);
 		} break;
 
 		case NETWORK_COMMAND_REMOTE_CALL: {
-			rpc_manager->process_rpc(p_from, p_packet, p_packet_len);
+			rpc->process_rpc(p_from, p_packet, p_packet_len);
 		} break;
 
 		case NETWORK_COMMAND_RAW: {
@@ -177,140 +176,6 @@ void MultiplayerAPI::_process_packet(int p_from, const uint8_t *p_packet, int p_
 	}
 }
 
-void MultiplayerAPI::_process_simplify_path(int p_from, const uint8_t *p_packet, int p_packet_len) {
-	ERR_FAIL_COND_MSG(p_packet_len < 38, "Invalid packet received. Size too small.");
-	int ofs = 1;
-
-	String methods_md5;
-	methods_md5.parse_utf8((const char *)(p_packet + ofs), 32);
-	ofs += 33;
-
-	int id = decode_uint32(&p_packet[ofs]);
-	ofs += 4;
-
-	String paths;
-	paths.parse_utf8((const char *)(p_packet + ofs), p_packet_len - ofs);
-
-	NodePath path = paths;
-
-	if (!path_get_cache.has(p_from)) {
-		path_get_cache[p_from] = PathGetCache();
-	}
-
-	Node *node = root_node->get_node(path);
-	ERR_FAIL_COND(node == nullptr);
-	const bool valid_rpc_checksum = rpc_manager->get_rpc_md5(node) == methods_md5;
-	if (valid_rpc_checksum == false) {
-		ERR_PRINT("The rpc node checksum failed. Make sure to have the same methods on both nodes. Node path: " + path);
-	}
-
-	PathGetCache::NodeInfo ni;
-	ni.path = path;
-
-	path_get_cache[p_from].nodes[id] = ni;
-
-	// Encode path to send ack.
-	CharString pname = String(path).utf8();
-	int len = encode_cstring(pname.get_data(), nullptr);
-
-	Vector<uint8_t> packet;
-
-	packet.resize(1 + 1 + len);
-	packet.write[0] = NETWORK_COMMAND_CONFIRM_PATH;
-	packet.write[1] = valid_rpc_checksum;
-	encode_cstring(pname.get_data(), &packet.write[2]);
-
-	multiplayer_peer->set_transfer_channel(0);
-	multiplayer_peer->set_transfer_mode(Multiplayer::TRANSFER_MODE_RELIABLE);
-	multiplayer_peer->set_target_peer(p_from);
-	multiplayer_peer->put_packet(packet.ptr(), packet.size());
-}
-
-void MultiplayerAPI::_process_confirm_path(int p_from, const uint8_t *p_packet, int p_packet_len) {
-	ERR_FAIL_COND_MSG(p_packet_len < 3, "Invalid packet received. Size too small.");
-
-	const bool valid_rpc_checksum = p_packet[1];
-
-	String paths;
-	paths.parse_utf8((const char *)&p_packet[2], p_packet_len - 2);
-
-	NodePath path = paths;
-
-	if (valid_rpc_checksum == false) {
-		ERR_PRINT("The rpc node checksum failed. Make sure to have the same methods on both nodes. Node path: " + path);
-	}
-
-	PathSentCache *psc = path_send_cache.getptr(path);
-	ERR_FAIL_COND_MSG(!psc, "Invalid packet received. Tries to confirm a path which was not found in cache.");
-
-	Map<int, bool>::Element *E = psc->confirmed_peers.find(p_from);
-	ERR_FAIL_COND_MSG(!E, "Invalid packet received. Source peer was not found in cache for the given path.");
-	E->get() = true;
-}
-
-bool MultiplayerAPI::_send_confirm_path(Node *p_node, NodePath p_path, PathSentCache *psc, int p_target) {
-	bool has_all_peers = true;
-	List<int> peers_to_add; // If one is missing, take note to add it.
-
-	for (Set<int>::Element *E = connected_peers.front(); E; E = E->next()) {
-		if (p_target < 0 && E->get() == -p_target) {
-			continue; // Continue, excluded.
-		}
-
-		if (p_target > 0 && E->get() != p_target) {
-			continue; // Continue, not for this peer.
-		}
-
-		Map<int, bool>::Element *F = psc->confirmed_peers.find(E->get());
-
-		if (!F || !F->get()) {
-			// Path was not cached, or was cached but is unconfirmed.
-			if (!F) {
-				// Not cached at all, take note.
-				peers_to_add.push_back(E->get());
-			}
-
-			has_all_peers = false;
-		}
-	}
-
-	if (peers_to_add.size() > 0) {
-		// Those that need to be added, send a message for this.
-
-		// Encode function name.
-		const CharString path = String(p_path).utf8();
-		const int path_len = encode_cstring(path.get_data(), nullptr);
-
-		// Extract MD5 from rpc methods list.
-		const String methods_md5 = rpc_manager->get_rpc_md5(p_node);
-		const int methods_md5_len = 33; // 32 + 1 for the `0` that is added by the encoder.
-
-		Vector<uint8_t> packet;
-		packet.resize(1 + 4 + path_len + methods_md5_len);
-		int ofs = 0;
-
-		packet.write[ofs] = NETWORK_COMMAND_SIMPLIFY_PATH;
-		ofs += 1;
-
-		ofs += encode_cstring(methods_md5.utf8().get_data(), &packet.write[ofs]);
-
-		ofs += encode_uint32(psc->id, &packet.write[ofs]);
-
-		ofs += encode_cstring(path.get_data(), &packet.write[ofs]);
-
-		for (int &E : peers_to_add) {
-			multiplayer_peer->set_target_peer(E); // To all of you.
-			multiplayer_peer->set_transfer_channel(0);
-			multiplayer_peer->set_transfer_mode(Multiplayer::TRANSFER_MODE_RELIABLE);
-			multiplayer_peer->put_packet(packet.ptr(), packet.size());
-
-			psc->confirmed_peers.insert(E, false); // Insert into confirmed, but as false since it was not confirmed.
-		}
-	}
-
-	return has_all_peers;
-}
-
 // The variant is compressed and encoded; The first byte contains all the meta
 // information and the format is:
 // - The first LSB 5 bits are used for the variant type.
@@ -537,23 +402,14 @@ Error MultiplayerAPI::decode_and_decompress_variants(Vector<Variant> &r_variants
 
 void MultiplayerAPI::_add_peer(int p_id) {
 	connected_peers.insert(p_id);
-	path_get_cache.insert(p_id, PathGetCache());
+	cache->on_peer_change(p_id, true);
 	replicator->on_peer_change(p_id, true);
 	emit_signal(SNAME("peer_connected"), p_id);
 }
 
 void MultiplayerAPI::_del_peer(int p_id) {
 	replicator->on_peer_change(p_id, false);
-	// Cleanup get cache.
-	path_get_cache.erase(p_id);
-	// Cleanup sent cache.
-	// Some refactoring is needed to make this faster and do paths GC.
-	List<NodePath> keys;
-	path_send_cache.get_key_list(&keys);
-	for (const NodePath &E : keys) {
-		PathSentCache *psc = path_send_cache.getptr(E);
-		psc->confirmed_peers.erase(p_id);
-	}
+	cache->on_peer_change(p_id, false);
 	connected_peers.erase(p_id);
 	emit_signal(SNAME("peer_disconnected"), p_id);
 }
@@ -605,41 +461,15 @@ void MultiplayerAPI::_process_raw(int p_from, const uint8_t *p_packet, int p_pac
 }
 
 bool MultiplayerAPI::is_cache_confirmed(NodePath p_path, int p_peer) {
-	const PathSentCache *psc = path_send_cache.getptr(p_path);
-	ERR_FAIL_COND_V(!psc, false);
-	const Map<int, bool>::Element *F = psc->confirmed_peers.find(p_peer);
-	ERR_FAIL_COND_V(!F, false); // Should never happen.
-	return F->get();
-}
-
-bool MultiplayerAPI::send_confirm_path(Node *p_node, NodePath p_path, int p_peer_id, int &r_id) {
-	// See if the path is cached.
-	PathSentCache *psc = path_send_cache.getptr(p_path);
-	if (!psc) {
-		// Path is not cached, create.
-		path_send_cache[p_path] = PathSentCache();
-		psc = path_send_cache.getptr(p_path);
-		psc->id = last_send_cache_id++;
-	}
-	r_id = psc->id;
-
-	// See if all peers have cached path (if so, call can be fast).
-	return _send_confirm_path(p_node, p_path, psc, p_peer_id);
+	return cache->is_cache_confirmed(p_path, p_peer);
 }
 
-Node *MultiplayerAPI::get_cached_node(int p_from, uint32_t p_node_id) {
-	Map<int, PathGetCache>::Element *E = path_get_cache.find(p_from);
-	ERR_FAIL_COND_V_MSG(!E, nullptr, vformat("No cache found for peer %d.", p_from));
-
-	Map<int, PathGetCache::NodeInfo>::Element *F = E->get().nodes.find(p_node_id);
-	ERR_FAIL_COND_V_MSG(!F, nullptr, vformat("ID %d not found in cache of peer %d.", p_node_id, p_from));
+bool MultiplayerAPI::send_object_cache(Object *p_obj, NodePath p_path, int p_peer_id, int &r_id) {
+	return cache->send_object_cache(p_obj, p_path, p_peer_id, r_id); // Forwarded unchanged to the cache interface, which also decides what ends up in r_id.
+}
 
-	PathGetCache::NodeInfo *ni = &F->get();
-	Node *node = root_node->get_node(ni->path);
-	if (!node) {
-		ERR_PRINT("Failed to get cached path: " + String(ni->path) + ".");
-	}
-	return node;
+Object *MultiplayerAPI::get_cached_object(int p_from, uint32_t p_cache_id) {
+	return cache->get_cached_object(p_from, p_cache_id);
 }
 
 int MultiplayerAPI::get_unique_id() const {
@@ -680,8 +510,12 @@ bool MultiplayerAPI::is_object_decoding_allowed() const {
 	return allow_object_decoding;
 }
 
-void MultiplayerAPI::rpcp(Node *p_node, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount) {
-	rpc_manager->rpcp(p_node, p_peer_id, p_method, p_arg, p_argcount);
+String MultiplayerAPI::get_rpc_md5(const Object *p_obj) const {
+	return rpc->get_rpc_md5(p_obj); // Forwarded to the RPC interface; its base implementation returns an empty String.
+}
+
+void MultiplayerAPI::rpcp(Object *p_obj, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount) {
+	rpc->rpcp(p_obj, p_peer_id, p_method, p_arg, p_argcount);
 }
 
 Error MultiplayerAPI::spawn(Object *p_object, Variant p_config) {
@@ -701,8 +535,8 @@ Error MultiplayerAPI::replication_stop(Object *p_object, Variant p_config) {
 }
 
 void MultiplayerAPI::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("set_root_node", "node"), &MultiplayerAPI::set_root_node);
-	ClassDB::bind_method(D_METHOD("get_root_node"), &MultiplayerAPI::get_root_node);
+	ClassDB::bind_method(D_METHOD("set_root_path", "path"), &MultiplayerAPI::set_root_path);
+	ClassDB::bind_method(D_METHOD("get_root_path"), &MultiplayerAPI::get_root_path);
 	ClassDB::bind_method(D_METHOD("send_bytes", "bytes", "id", "mode", "channel"), &MultiplayerAPI::send_bytes, DEFVAL(MultiplayerPeer::TARGET_PEER_BROADCAST), DEFVAL(Multiplayer::TRANSFER_MODE_RELIABLE), DEFVAL(0));
 	ClassDB::bind_method(D_METHOD("has_multiplayer_peer"), &MultiplayerAPI::has_multiplayer_peer);
 	ClassDB::bind_method(D_METHOD("get_multiplayer_peer"), &MultiplayerAPI::get_multiplayer_peer);
@@ -722,7 +556,7 @@ void MultiplayerAPI::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "allow_object_decoding"), "set_allow_object_decoding", "is_object_decoding_allowed");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "refuse_new_connections"), "set_refuse_new_connections", "is_refusing_new_connections");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "multiplayer_peer", PROPERTY_HINT_RESOURCE_TYPE, "MultiplayerPeer", PROPERTY_USAGE_NONE), "set_multiplayer_peer", "get_multiplayer_peer");
-	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "root_node", PROPERTY_HINT_RESOURCE_TYPE, "Node", PROPERTY_USAGE_NONE), "set_root_node", "get_root_node");
+	ADD_PROPERTY(PropertyInfo(Variant::NODE_PATH, "root_path"), "set_root_path", "get_root_path");
 	ADD_PROPERTY_DEFAULT("refuse_new_connections", false);
 
 	ADD_SIGNAL(MethodInfo("peer_connected", PropertyInfo(Variant::INT, "id")));
@@ -739,11 +573,18 @@ MultiplayerAPI::MultiplayerAPI() {
 	} else {
 		replicator.instantiate();
 	}
-	rpc_manager = memnew(RPCManager(this));
-	clear();
+	if (create_default_rpc_interface) {
+		rpc = Ref<MultiplayerRPCInterface>(create_default_rpc_interface(this));
+	} else {
+		rpc.instantiate();
+	}
+	if (create_default_cache_interface) {
+		cache = Ref<MultiplayerCacheInterface>(create_default_cache_interface(this));
+	} else {
+		cache.instantiate();
+	}
 }
 
 MultiplayerAPI::~MultiplayerAPI() {
 	clear();
-	memdelete(rpc_manager);
 }
diff --git a/core/multiplayer/multiplayer_api.h b/core/multiplayer/multiplayer_api.h
index f4fdafc323..9fe67615e3 100644
--- a/core/multiplayer/multiplayer_api.h
+++ b/core/multiplayer/multiplayer_api.h
@@ -55,7 +55,34 @@ public:
 	MultiplayerReplicationInterface() {}
 };
 
-class RPCManager;
+class MultiplayerRPCInterface : public RefCounted { // RPC backend held by MultiplayerAPI; every default below does nothing.
+	GDCLASS(MultiplayerRPCInterface, RefCounted);
+
+public:
+	// Called by Node.rpc
+	virtual void rpcp(Object *p_obj, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount) {}
+	virtual void process_rpc(int p_from, const uint8_t *p_packet, int p_packet_len) {} // Receives NETWORK_COMMAND_REMOTE_CALL packets from MultiplayerAPI::_process_packet().
+	virtual String get_rpc_md5(const Object *p_obj) const { return String(); } // Default: empty string.
+
+	MultiplayerRPCInterface() {}
+};
+
+class MultiplayerCacheInterface : public RefCounted { // Path/object cache backend held by MultiplayerAPI; the defaults below do nothing.
+	GDCLASS(MultiplayerCacheInterface, RefCounted);
+
+public:
+	virtual void clear() {} // Called from MultiplayerAPI::clear().
+	virtual void on_peer_change(int p_id, bool p_connected) {} // p_connected is true from _add_peer() and false from _del_peer().
+	virtual void process_simplify_path(int p_from, const uint8_t *p_packet, int p_packet_len) {} // Receives NETWORK_COMMAND_SIMPLIFY_PATH packets.
+	virtual void process_confirm_path(int p_from, const uint8_t *p_packet, int p_packet_len) {} // Receives NETWORK_COMMAND_CONFIRM_PATH packets.
+
+	// Returns true if all peers have cached path.
+	virtual bool send_object_cache(Object *p_obj, NodePath p_path, int p_target, int &p_id) { return false; } // NOTE(review): this default never writes p_id - confirm callers tolerate that.
+	virtual Object *get_cached_object(int p_from, uint32_t p_cache_id) { return nullptr; } // Default: nothing is cached.
+	virtual bool is_cache_confirmed(NodePath p_path, int p_peer) { return false; } // Default: never confirmed.
+
+	MultiplayerCacheInterface() {}
+};
 
 class MultiplayerAPI : public RefCounted {
 	GDCLASS(MultiplayerAPI, RefCounted);
@@ -85,49 +112,30 @@ public:
 	};
 
 private:
-	//path sent caches
-	struct PathSentCache {
-		Map<int, bool> confirmed_peers;
-		int id;
-	};
-
-	//path get caches
-	struct PathGetCache {
-		struct NodeInfo {
-			NodePath path;
-			ObjectID instance;
-		};
-
-		Map<int, NodeInfo> nodes;
-	};
-
 	Ref<MultiplayerPeer> multiplayer_peer;
 	Set<int> connected_peers;
 	int remote_sender_id = 0;
 	int remote_sender_override = 0;
 
-	HashMap<NodePath, PathSentCache> path_send_cache;
-	Map<int, PathGetCache> path_get_cache;
-	int last_send_cache_id;
 	Vector<uint8_t> packet_cache;
 
-	Node *root_node = nullptr;
+	NodePath root_path;
 	bool allow_object_decoding = false;
 
+	Ref<MultiplayerCacheInterface> cache;
 	Ref<MultiplayerReplicationInterface> replicator;
-	RPCManager *rpc_manager = nullptr;
+	Ref<MultiplayerRPCInterface> rpc;
 
 protected:
 	static void _bind_methods();
 
-	bool _send_confirm_path(Node *p_node, NodePath p_path, PathSentCache *psc, int p_target);
 	void _process_packet(int p_from, const uint8_t *p_packet, int p_packet_len);
-	void _process_simplify_path(int p_from, const uint8_t *p_packet, int p_packet_len);
-	void _process_confirm_path(int p_from, const uint8_t *p_packet, int p_packet_len);
 	void _process_raw(int p_from, const uint8_t *p_packet, int p_packet_len);
 
 public:
 	static MultiplayerReplicationInterface *(*create_default_replication_interface)(MultiplayerAPI *p_multiplayer);
+	static MultiplayerRPCInterface *(*create_default_rpc_interface)(MultiplayerAPI *p_multiplayer);
+	static MultiplayerCacheInterface *(*create_default_cache_interface)(MultiplayerAPI *p_multiplayer);
 
 	static Error encode_and_compress_variant(const Variant &p_variant, uint8_t *p_buffer, int &r_len, bool p_allow_object_decoding);
 	static Error decode_and_decompress_variant(Variant &r_variant, const uint8_t *p_buffer, int p_len, int *r_len, bool p_allow_object_decoding);
@@ -136,23 +144,24 @@ public:
 
 	void poll();
 	void clear();
-	void set_root_node(Node *p_node);
-	Node *get_root_node();
+	void set_root_path(const NodePath &p_path);
+	NodePath get_root_path() const;
 	void set_multiplayer_peer(const Ref<MultiplayerPeer> &p_peer);
 	Ref<MultiplayerPeer> get_multiplayer_peer() const;
 
 	Error send_bytes(Vector<uint8_t> p_data, int p_to = MultiplayerPeer::TARGET_PEER_BROADCAST, Multiplayer::TransferMode p_mode = Multiplayer::TRANSFER_MODE_RELIABLE, int p_channel = 0);
 
-	// Called by Node.rpc
-	void rpcp(Node *p_node, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount);
+	// RPC API
+	void rpcp(Object *p_obj, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount);
+	String get_rpc_md5(const Object *p_obj) const;
 	// Replication API
 	Error spawn(Object *p_object, Variant p_config);
 	Error despawn(Object *p_object, Variant p_config);
 	Error replication_start(Object *p_object, Variant p_config);
 	Error replication_stop(Object *p_object, Variant p_config);
-	// Called by replicator
-	bool send_confirm_path(Node *p_node, NodePath p_path, int p_target, int &p_id);
-	Node *get_cached_node(int p_from, uint32_t p_node_id);
+	// Cache API
+	bool send_object_cache(Object *p_obj, NodePath p_path, int p_target, int &p_id);
+	Object *get_cached_object(int p_from, uint32_t p_cache_id);
 	bool is_cache_confirmed(NodePath p_path, int p_peer);
 
 	void _add_peer(int p_id);
@@ -174,8 +183,6 @@ public:
 	void set_allow_object_decoding(bool p_enable);
 	bool is_object_decoding_allowed() const;
 
-	RPCManager *get_rpc_manager() const { return rpc_manager; }
-
 #ifdef DEBUG_ENABLED
 	void profile_bandwidth(const String &p_inout, int p_size);
 #endif
diff --git a/core/string/string_name.cpp b/core/string/string_name.cpp
index 61742ac582..11674629fc 100644
--- a/core/string/string_name.cpp
+++ b/core/string/string_name.cpp
@@ -84,12 +84,15 @@ void StringName::cleanup() {
 	for (int i = 0; i < STRING_TABLE_LEN; i++) {
 		while (_table[i]) {
 			_Data *d = _table[i];
-			lost_strings++;
-			if (d->static_count.get() != d->refcount.get() && OS::get_singleton()->is_stdout_verbose()) {
-				if (d->cname) {
-					print_line("Orphan StringName: " + String(d->cname));
-				} else {
-					print_line("Orphan StringName: " + String(d->name));
+			if (d->static_count.get() != d->refcount.get()) {
+				lost_strings++;
+
+				if (OS::get_singleton()->is_stdout_verbose()) {
+					if (d->cname) {
+						print_line("Orphan StringName: " + String(d->cname));
+					} else {
+						print_line("Orphan StringName: " + String(d->name));
+					}
 				}
 			}
 
diff --git a/core/templates/rid_owner.h b/core/templates/rid_owner.h
index 3ed81e76fd..95632cdec2 100644
--- a/core/templates/rid_owner.h
+++ b/core/templates/rid_owner.h
@@ -292,43 +292,32 @@ public:
 	_FORCE_INLINE_ uint32_t get_rid_count() const {
 		return alloc_count;
 	}
-
-	_FORCE_INLINE_ T *get_ptr_by_index(uint32_t p_index) {
-		ERR_FAIL_UNSIGNED_INDEX_V(p_index, alloc_count, nullptr);
+	void get_owned_list(List<RID> *p_owned) {
 		if (THREAD_SAFE) {
 			spin_lock.lock();
 		}
-		uint64_t idx = free_list_chunks[p_index / elements_in_chunk][p_index % elements_in_chunk];
-		T *ptr = &chunks[idx / elements_in_chunk][idx % elements_in_chunk];
-		if (THREAD_SAFE) {
-			spin_lock.unlock();
-		}
-		return ptr;
-	}
-
-	_FORCE_INLINE_ RID get_rid_by_index(uint32_t p_index) {
-		ERR_FAIL_INDEX_V(p_index, alloc_count, RID());
-		if (THREAD_SAFE) {
-			spin_lock.lock();
+		for (size_t i = 0; i < max_alloc; i++) {
+			uint64_t validator = validator_chunks[i / elements_in_chunk][i % elements_in_chunk];
+			if (validator != 0xFFFFFFFF) {
+				p_owned->push_back(_make_from_id((validator << 32) | i));
+			}
 		}
-		uint64_t idx = free_list_chunks[p_index / elements_in_chunk][p_index % elements_in_chunk];
-		uint64_t validator = validator_chunks[idx / elements_in_chunk][idx % elements_in_chunk];
-
-		RID rid = _make_from_id((validator << 32) | idx);
 		if (THREAD_SAFE) {
 			spin_lock.unlock();
 		}
-		return rid;
 	}
 
-	void get_owned_list(List<RID> *p_owned) {
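+	// Used for fast iteration over the elements or RIDs. Writes one RID per owned entry with no bounds check, so the caller must size p_rid_buffer accordingly.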
+	void fill_owned_buffer(RID *p_rid_buffer) {
 		if (THREAD_SAFE) {
 			spin_lock.lock();
 		}
+		uint32_t idx = 0;
 		for (size_t i = 0; i < max_alloc; i++) {
 			uint64_t validator = validator_chunks[i / elements_in_chunk][i % elements_in_chunk];
 			if (validator != 0xFFFFFFFF) {
-				p_owned->push_back(_make_from_id((validator << 32) | i));
+				p_rid_buffer[idx] = _make_from_id((validator << 32) | i);
+				idx++;
 			}
 		}
 		if (THREAD_SAFE) {
@@ -425,18 +414,14 @@ public:
 		return alloc.get_rid_count();
 	}
 
-	_FORCE_INLINE_ RID get_rid_by_index(uint32_t p_index) {
-		return alloc.get_rid_by_index(p_index);
-	}
-
-	_FORCE_INLINE_ T *get_ptr_by_index(uint32_t p_index) {
-		return *alloc.get_ptr_by_index(p_index);
-	}
-
 	_FORCE_INLINE_ void get_owned_list(List<RID> *p_owned) {
 		return alloc.get_owned_list(p_owned);
 	}
 
+	void fill_owned_buffer(RID *p_rid_buffer) {
+		alloc.fill_owned_buffer(p_rid_buffer);
+	}
+
 	void set_description(const char *p_descrption) {
 		alloc.set_description(p_descrption);
 	}
@@ -485,17 +470,12 @@ public:
 		return alloc.get_rid_count();
 	}
 
-	_FORCE_INLINE_ RID get_rid_by_index(uint32_t p_index) {
-		return alloc.get_rid_by_index(p_index);
-	}
-
-	_FORCE_INLINE_ T *get_ptr_by_index(uint32_t p_index) {
-		return alloc.get_ptr_by_index(p_index);
-	}
-
 	_FORCE_INLINE_ void get_owned_list(List<RID> *p_owned) {
 		return alloc.get_owned_list(p_owned);
 	}
+	void fill_owned_buffer(RID *p_rid_buffer) {
+		alloc.fill_owned_buffer(p_rid_buffer);
+	}
 
 	void set_description(const char *p_descrption) {
 		alloc.set_description(p_descrption);
diff --git a/doc/classes/CanvasItem.xml b/doc/classes/CanvasItem.xml
index 44845947b1..16aa7309cc 100644
--- a/doc/classes/CanvasItem.xml
+++ b/doc/classes/CanvasItem.xml
@@ -334,7 +334,7 @@
 		<method name="get_global_mouse_position" qualifiers="const">
 			<return type="Vector2" />
 			<description>
-				Returns the global position of the mouse.
+				Returns the mouse's position in the [CanvasLayer] that this [CanvasItem] is in using the coordinate system of the [CanvasLayer].
 			</description>
 		</method>
 		<method name="get_global_transform" qualifiers="const">
@@ -352,7 +352,7 @@
 		<method name="get_local_mouse_position" qualifiers="const">
 			<return type="Vector2" />
 			<description>
-				Returns the mouse position relative to this item's position.
+				Returns the mouse's position in this [CanvasItem] using the local coordinate system of this [CanvasItem].
 			</description>
 		</method>
 		<method name="get_transform" qualifiers="const">
diff --git a/doc/classes/CanvasLayer.xml b/doc/classes/CanvasLayer.xml
index 9ee5ce0dcb..614bd558e8 100644
--- a/doc/classes/CanvasLayer.xml
+++ b/doc/classes/CanvasLayer.xml
@@ -44,5 +44,16 @@
 		<member name="transform" type="Transform2D" setter="set_transform" getter="get_transform" default="Transform2D(1, 0, 0, 1, 0, 0)">
 			The layer's transform.
 		</member>
+		<member name="visible" type="bool" setter="set_visible" getter="is_visible" default="true">
+			If [code]false[/code], any [CanvasItem] under this [CanvasLayer] will be hidden.
+			Unlike [member CanvasItem.visible], visibility of a [CanvasLayer] isn't propagated to underlying layers.
+		</member>
 	</members>
+	<signals>
+		<signal name="visibility_changed">
+			<description>
+				Emitted when visibility of the layer is changed. See [member visible].
+			</description>
+		</signal>
+	</signals>
 </class>
diff --git a/doc/classes/Image.xml b/doc/classes/Image.xml
index 60d4b664d2..2f4a0079c9 100644
--- a/doc/classes/Image.xml
+++ b/doc/classes/Image.xml
@@ -88,6 +88,15 @@
 			<description>
 			</description>
 		</method>
+		<method name="compute_image_metrics">
+			<return type="Dictionary" />
+			<argument index="0" name="compared_image" type="Image" />
+			<argument index="1" name="use_luma" type="bool" />
+			<description>
+				Computes image metrics on the current image and the compared image.
+				The dictionary contains [code]max[/code], [code]mean[/code], [code]mean_squared[/code], [code]root_mean_squared[/code] and [code]peak_snr[/code].
+			</description>
+		</method>
 		<method name="convert">
 			<return type="void" />
 			<argument index="0" name="format" type="int" enum="Image.Format" />
diff --git a/doc/classes/InputEventMouse.xml b/doc/classes/InputEventMouse.xml
index 5215c29b4a..054b3dbb33 100644
--- a/doc/classes/InputEventMouse.xml
+++ b/doc/classes/InputEventMouse.xml
@@ -14,10 +14,12 @@
 			The mouse button mask identifier, one of or a bitwise combination of the [enum MouseButton] button masks.
 		</member>
 		<member name="global_position" type="Vector2" setter="set_global_position" getter="get_global_position" default="Vector2(0, 0)">
-			The global mouse position relative to the current [Viewport]. If used in [method Control._gui_input] and if the current [Control] is not under the mouse, moving it will not update this value.
+			When received in [method Node._input] or [method Node._unhandled_input], returns the mouse's position in the root [Viewport] using the coordinate system of the root [Viewport].
+			When received in [method Control._gui_input], returns the mouse's position in the [CanvasLayer] that the [Control] is in using the coordinate system of the [CanvasLayer].
 		</member>
 		<member name="position" type="Vector2" setter="set_position" getter="get_position" default="Vector2(0, 0)">
-			The local mouse position relative to the [Viewport]. If used in [method Control._gui_input], the position is relative to the current [Control] which is under the mouse. If the current  [Control] is not under the mouse, moving it will not update this value.
+			When received in [method Node._input] or [method Node._unhandled_input], returns the mouse's position in the [Viewport] this [Node] is in using the coordinate system of this [Viewport].
+			When received in [method Control._gui_input], returns the mouse's position in the [Control] using the local coordinate system of the [Control].
 		</member>
 	</members>
 </class>
diff --git a/doc/classes/MultiplayerAPI.xml b/doc/classes/MultiplayerAPI.xml
index 426d902983..642e000efc 100644
--- a/doc/classes/MultiplayerAPI.xml
+++ b/doc/classes/MultiplayerAPI.xml
@@ -79,8 +79,8 @@
 		<member name="refuse_new_connections" type="bool" setter="set_refuse_new_connections" getter="is_refusing_new_connections" default="false">
 			If [code]true[/code], the MultiplayerAPI's [member multiplayer_peer] refuses new incoming connections.
 		</member>
-		<member name="root_node" type="Node" setter="set_root_node" getter="get_root_node">
-			The root node to use for RPCs. Instead of an absolute path, a relative path will be used to find the node upon which the RPC should be executed.
+		<member name="root_path" type="NodePath" setter="set_root_path" getter="get_root_path" default="NodePath(&quot;&quot;)">
+			The root path to use for RPCs and replication. Instead of an absolute path, a relative path will be used to find the node upon which the RPC should be executed.
 			This effectively allows to have different branches of the scene tree to be managed by different MultiplayerAPI, allowing for example to run both client and server in the same scene.
 		</member>
 	</members>
diff --git a/doc/classes/Node.xml b/doc/classes/Node.xml
index 47be5695a0..89bc905e69 100644
--- a/doc/classes/Node.xml
+++ b/doc/classes/Node.xml
@@ -52,7 +52,7 @@
 				It is only called if input processing is enabled, which is done automatically if this method is overridden, and can be toggled with [method set_process_input].
 				To consume the input event and stop it propagating further to other nodes, [method Viewport.set_input_as_handled] can be called.
 				For gameplay input, [method _unhandled_input] and [method _unhandled_key_input] are usually a better fit as they allow the GUI to intercept the events first.
-				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not orphan).
+				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not an orphan).
 			</description>
 		</method>
 		<method name="_physics_process" qualifiers="virtual">
@@ -62,7 +62,7 @@
 				Called during the physics processing step of the main loop. Physics processing means that the frame rate is synced to the physics, i.e. the [code]delta[/code] variable should be constant. [code]delta[/code] is in seconds.
 				It is only called if physics processing is enabled, which is done automatically if this method is overridden, and can be toggled with [method set_physics_process].
 				Corresponds to the [constant NOTIFICATION_PHYSICS_PROCESS] notification in [method Object._notification].
-				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not orphan).
+				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not an orphan).
 			</description>
 		</method>
 		<method name="_process" qualifiers="virtual">
@@ -72,38 +72,38 @@
 				Called during the processing step of the main loop. Processing happens at every frame and as fast as possible, so the [code]delta[/code] time since the previous frame is not constant. [code]delta[/code] is in seconds.
 				It is only called if processing is enabled, which is done automatically if this method is overridden, and can be toggled with [method set_process].
 				Corresponds to the [constant NOTIFICATION_PROCESS] notification in [method Object._notification].
-				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not orphan).
+				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not an orphan).
 			</description>
 		</method>
 		<method name="_ready" qualifiers="virtual">
 			<return type="void" />
 			<description>
 				Called when the node is "ready", i.e. when both the node and its children have entered the scene tree. If the node has children, their [method _ready] callbacks get triggered first, and the parent node will receive the ready notification afterwards.
-				Corresponds to the [constant NOTIFICATION_READY] notification in [method Object._notification]. See also the [code]onready[/code] keyword for variables.
+				Corresponds to the [constant NOTIFICATION_READY] notification in [method Object._notification]. See also the [code]@onready[/code] annotation for variables.
 				Usually used for initialization. For even earlier initialization, [method Object._init] may be used. See also [method _enter_tree].
-				[b]Note:[/b] [method _ready] may be called only once for each node. After removing a node from the scene tree and adding again, [code]_ready[/code] will not be called for the second time. This can be bypassed with requesting another call with [method request_ready], which may be called anywhere before adding the node again.
+				[b]Note:[/b] [method _ready] may be called only once for each node. After removing a node from the scene tree and adding it again, [code]_ready[/code] will not be called a second time. This can be bypassed by requesting another call with [method request_ready], which may be called anywhere before adding the node again.
 			</description>
 		</method>
 		<method name="_unhandled_input" qualifiers="virtual">
 			<return type="void" />
 			<argument index="0" name="event" type="InputEvent" />
 			<description>
-				Called when an [InputEvent] hasn't been consumed by [method _input] or any GUI. The input event propagates up through the node tree until a node consumes it.
+				Called when an [InputEvent] hasn't been consumed by [method _input] or any GUI [Control] item. The input event propagates up through the node tree until a node consumes it.
 				It is only called if unhandled input processing is enabled, which is done automatically if this method is overridden, and can be toggled with [method set_process_unhandled_input].
 				To consume the input event and stop it propagating further to other nodes, [method Viewport.set_input_as_handled] can be called.
 				For gameplay input, this and [method _unhandled_key_input] are usually a better fit than [method _input] as they allow the GUI to intercept the events first.
-				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not orphan).
+				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not an orphan).
 			</description>
 		</method>
 		<method name="_unhandled_key_input" qualifiers="virtual">
 			<return type="void" />
 			<argument index="0" name="event" type="InputEvent" />
 			<description>
-				Called when an [InputEventKey] or [InputEventShortcut] hasn't been consumed by [method _input] or any GUI. The input event propagates up through the node tree until a node consumes it.
+				Called when an [InputEventKey] or [InputEventShortcut] hasn't been consumed by [method _input] or any GUI [Control] item. The input event propagates up through the node tree until a node consumes it.
 				It is only called if unhandled key input processing is enabled, which is done automatically if this method is overridden, and can be toggled with [method set_process_unhandled_key_input].
 				To consume the input event and stop it propagating further to other nodes, [method Viewport.set_input_as_handled] can be called.
 				For gameplay input, this and [method _unhandled_input] are usually a better fit than [method _input] as they allow the GUI to intercept the events first.
-				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not orphan).
+				[b]Note:[/b] This method is only called if the node is present in the scene tree (i.e. if it's not an orphan).
 			</description>
 		</method>
 		<method name="add_child">
diff --git a/doc/classes/TextureButton.xml b/doc/classes/TextureButton.xml
index 476ab2d1bf..5f081b95f5 100644
--- a/doc/classes/TextureButton.xml
+++ b/doc/classes/TextureButton.xml
@@ -12,17 +12,17 @@
 		<link title="3D Voxel Demo">https://godotengine.org/asset-library/asset/676</link>
 	</tutorials>
 	<members>
-		<member name="expand" type="bool" setter="set_expand" getter="get_expand" default="false">
-			If [code]true[/code], the texture stretches to the edges of the node's bounding rectangle using the [member stretch_mode]. If [code]false[/code], the texture will not scale with the node.
-		</member>
 		<member name="flip_h" type="bool" setter="set_flip_h" getter="is_flipped_h" default="false">
 			If [code]true[/code], texture is flipped horizontally.
 		</member>
 		<member name="flip_v" type="bool" setter="set_flip_v" getter="is_flipped_v" default="false">
 			If [code]true[/code], texture is flipped vertically.
 		</member>
-		<member name="stretch_mode" type="int" setter="set_stretch_mode" getter="get_stretch_mode" enum="TextureButton.StretchMode" default="0">
-			Controls the texture's behavior when you resize the node's bounding rectangle, [b]only if[/b] [member expand] is [code]true[/code]. Set it to one of the [enum StretchMode] constants. See the constants to learn more.
+		<member name="ignore_texture_size" type="bool" setter="set_ignore_texture_size" getter="get_ignore_texture_size" default="false">
+			If [code]true[/code], the size of the texture won't be considered for minimum size calculation, so the [TextureButton] can be shrunk down past the texture size.
+		</member>
+		<member name="stretch_mode" type="int" setter="set_stretch_mode" getter="get_stretch_mode" enum="TextureButton.StretchMode" default="2">
+			Controls the texture's behavior when you resize the node's bounding rectangle. See the [enum StretchMode] constants for available options.
 		</member>
 		<member name="texture_click_mask" type="BitMap" setter="set_click_mask" getter="get_click_mask">
 			Pure black and white [BitMap] image to use for click detection. On the mask, white pixels represent the button's clickable area. Use it to create buttons with curved shapes.
diff --git a/doc/classes/Viewport.xml b/doc/classes/Viewport.xml
index 1b37cab68e..7a60ca9fa6 100644
--- a/doc/classes/Viewport.xml
+++ b/doc/classes/Viewport.xml
@@ -55,7 +55,7 @@
 		<method name="get_mouse_position" qualifiers="const">
 			<return type="Vector2" />
 			<description>
-				Returns the mouse position relative to the viewport.
+				Returns the mouse's position in this [Viewport] using the coordinate system of this [Viewport].
 			</description>
 		</method>
 		<method name="get_render_info">
@@ -180,7 +180,7 @@
 			<return type="void" />
 			<argument index="0" name="to_position" type="Vector2" />
 			<description>
-				Warps the mouse to a position relative to the viewport.
+				Moves the mouse pointer to the specified position in this [Viewport] using the coordinate system of this [Viewport].
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/bool.xml b/doc/classes/bool.xml
index 49f2d2dd7f..243d19d94f 100644
--- a/doc/classes/bool.xml
+++ b/doc/classes/bool.xml
@@ -52,7 +52,7 @@
 		[codeblocks]
 		[gdscript]
 		var _can_shoot = true
-		onready var _cool_down = $CoolDownTimer
+		@onready var _cool_down = $CoolDownTimer
 
 		func shoot():
 		    if _can_shoot and Input.is_action_pressed("shoot"):
diff --git a/editor/editor_help.cpp b/editor/editor_help.cpp
index dfc95fb676..96c0f3a209 100644
--- a/editor/editor_help.cpp
+++ b/editor/editor_help.cpp
@@ -2007,7 +2007,7 @@ FindBar::FindBar() {
 	hide_button = memnew(TextureButton);
 	add_child(hide_button);
 	hide_button->set_focus_mode(FOCUS_NONE);
-	hide_button->set_expand(true);
+	hide_button->set_ignore_texture_size(true);
 	hide_button->set_stretch_mode(TextureButton::STRETCH_KEEP_CENTERED);
 	hide_button->connect("pressed", callable_mp(this, &FindBar::_hide_bar));
 }
diff --git a/editor/editor_settings_dialog.cpp b/editor/editor_settings_dialog.cpp
index 2520d662c5..1cb95226ec 100644
--- a/editor/editor_settings_dialog.cpp
+++ b/editor/editor_settings_dialog.cpp
@@ -513,6 +513,38 @@ void EditorSettingsDialog::_shortcut_button_pressed(Object *p_item, int p_column
 	}
 }
 
+void EditorSettingsDialog::_shortcut_cell_double_clicked() {
+	// When a shortcut cell is double clicked:
+	// If the cell has children and is in the bindings column, and if its first child is editable,
+	// then uncollapse the cell, and if the first child is the only child, then edit that child.
+	// If the cell is in the bindings column and can be edited, then edit it.
+	// If the cell is in the name column, then toggle collapse.
+	const ShortcutButton edit_btn_id = EditorSettingsDialog::SHORTCUT_EDIT;
+	const int edit_btn_col = 1;
+	TreeItem *ti = shortcuts->get_selected();
+	String type = ti->get_meta("type");
+	int col = shortcuts->get_selected_column();
+	if (type == "shortcut" && col == 0) {
+		if (ti->get_first_child()) {
+			ti->set_collapsed(!ti->is_collapsed());
+		}
+	} else if (type == "shortcut" && col == 1) {
+		if (ti->get_first_child()) {
+			TreeItem *child_ti = ti->get_first_child();
+			if (child_ti->get_button_by_id(edit_btn_col, edit_btn_id) != -1) {
+				ti->set_collapsed(false);
+				if (ti->get_child_count() == 1) {
+					_shortcut_button_pressed(child_ti, edit_btn_col, edit_btn_id);
+				}
+			}
+		}
+	} else if (type == "event" && col == 1) {
+		if (ti->get_button_by_id(edit_btn_col, edit_btn_id) != -1) {
+			_shortcut_button_pressed(ti, edit_btn_col, edit_btn_id);
+		}
+	}
+}
+
 Variant EditorSettingsDialog::get_drag_data_fw(const Point2 &p_point, Control *p_from) {
 	TreeItem *selected = shortcuts->get_selected();
 
@@ -692,6 +724,7 @@ EditorSettingsDialog::EditorSettingsDialog() {
 	shortcuts->set_column_title(0, TTR("Name"));
 	shortcuts->set_column_title(1, TTR("Binding"));
 	shortcuts->connect("button_pressed", callable_mp(this, &EditorSettingsDialog::_shortcut_button_pressed));
+	shortcuts->connect("item_activated", callable_mp(this, &EditorSettingsDialog::_shortcut_cell_double_clicked));
 	tab_shortcuts->add_child(shortcuts);
 
 	shortcuts->set_drag_forwarding(this);
diff --git a/editor/editor_settings_dialog.h b/editor/editor_settings_dialog.h
index f1c4ea7770..c8858b4fcb 100644
--- a/editor/editor_settings_dialog.h
+++ b/editor/editor_settings_dialog.h
@@ -104,6 +104,7 @@ class EditorSettingsDialog : public AcceptDialog {
 
 	void _update_shortcuts();
 	void _shortcut_button_pressed(Object *p_item, int p_column, int p_idx);
+	void _shortcut_cell_double_clicked();
 
 	void _builtin_action_popup_index_pressed(int p_index);
 
diff --git a/editor/import/resource_importer_layered_texture.cpp b/editor/import/resource_importer_layered_texture.cpp
index d63366638e..69e3311fe6 100644
--- a/editor/import/resource_importer_layered_texture.cpp
+++ b/editor/import/resource_importer_layered_texture.cpp
@@ -32,8 +32,10 @@
 
 #include "resource_importer_texture.h"
 
+#include "core/error/error_macros.h"
 #include "core/io/config_file.h"
 #include "core/io/image_loader.h"
+#include "core/object/ref_counted.h"
 #include "editor/editor_file_system.h"
 #include "editor/editor_node.h"
 #include "resource_importer_texture.h"
@@ -263,12 +265,12 @@ void ResourceImporterLayeredTexture::_save_tex(Vector<Ref<Image>> p_images, cons
 	f->store_8('L');
 
 	f->store_32(StreamTextureLayered::FORMAT_VERSION);
-	f->store_32(p_images.size()); //2d layers or 3d depth
+	f->store_32(p_images.size()); // For 2d layers or 3d depth.
 	f->store_32(mode);
 	f->store_32(0);
 
 	f->store_32(0);
-	f->store_32(mipmap_images.size()); // amount of mipmaps
+	f->store_32(mipmap_images.size()); // Amount of mipmaps.
 	f->store_32(0);
 	f->store_32(0);
 
@@ -289,7 +291,6 @@ Error ResourceImporterLayeredTexture::import(const String &p_source_file, const
 	int hdr_compression = p_options["compress/hdr_compression"];
 	int bptc_ldr = p_options["compress/bptc_ldr"];
 	bool mipmaps = p_options["mipmaps/generate"];
-	//bool mipmap_limit = p_options["mipmaps/limit"];
 
 	int channel_pack = p_options["compress/channel_pack"];
 	int hslices = (p_options.has("slices/horizontal")) ? int(p_options["slices/horizontal"]) : 0;
@@ -377,87 +378,23 @@ Error ResourceImporterLayeredTexture::import(const String &p_source_file, const
 			slices.push_back(slice);
 		}
 	}
-
-	String extension = get_save_extension();
 	Array formats_imported;
-
-	if (compress_mode == COMPRESS_VRAM_COMPRESSED) {
-		//must import in all formats, in order of priority (so platform choses the best supported one. IE, etc2 over etc).
-		//Android, GLES 2.x
-
-		bool ok_on_pc = false;
-		bool is_hdr = (image->get_format() >= Image::FORMAT_RF && image->get_format() <= Image::FORMAT_RGBE9995);
-		bool is_ldr = (image->get_format() >= Image::FORMAT_L8 && image->get_format() <= Image::FORMAT_RGB565);
-		bool can_bptc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_bptc");
-		bool can_s3tc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_s3tc");
-
-		if (can_bptc) {
-			formats_imported.push_back("bptc"); // Needs to be added anyway.
-		}
-		bool can_compress_hdr = hdr_compression > 0;
-
-		if (is_hdr && can_compress_hdr) {
-			if (used_channels == Image::USED_CHANNELS_LA || used_channels == Image::USED_CHANNELS_RGBA) {
-				//can compress hdr, but hdr with alpha is not compressible
-
-				if (hdr_compression == 2) {
-					//but user selected to compress hdr anyway, so force an alpha-less format.
-					if (image->get_format() == Image::FORMAT_RGBAF) {
-						for (int i = 0; i < slices.size(); i++) {
-							slices.write[i]->convert(Image::FORMAT_RGBF);
-						}
-
-					} else if (image->get_format() == Image::FORMAT_RGBAH) {
-						for (int i = 0; i < slices.size(); i++) {
-							slices.write[i]->convert(Image::FORMAT_RGBH);
-						}
-					}
-				} else {
-					can_compress_hdr = false;
-				}
-			}
-
-			if (can_compress_hdr) {
-				if (!can_bptc) {
-					//default to rgbe
-					if (image->get_format() != Image::FORMAT_RGBE9995) {
-						for (int i = 0; i < slices.size(); i++) {
-							slices.write[i]->convert(Image::FORMAT_RGBE9995);
-						}
-					}
-				}
-			} else {
-				can_bptc = false;
-			}
-		}
-
-		if (is_ldr && can_bptc) {
-			if (bptc_ldr == 0 || (bptc_ldr == 1 && !(used_channels == Image::USED_CHANNELS_LA || used_channels == Image::USED_CHANNELS_RGBA))) {
-				can_bptc = false;
-			}
-		}
-
-		if (can_bptc || can_s3tc) {
-			_save_tex(slices, p_save_path + ".s3tc." + extension, compress_mode, lossy, can_bptc ? Image::COMPRESS_BPTC : Image::COMPRESS_S3TC, csource, used_channels, mipmaps, false);
-			r_platform_variants->push_back("s3tc");
-			formats_imported.push_back("s3tc");
-			ok_on_pc = true;
-		}
-
-		if (ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_etc2")) {
-			_save_tex(slices, p_save_path + ".etc2." + extension, compress_mode, lossy, Image::COMPRESS_ETC2, csource, used_channels, mipmaps, true);
-			r_platform_variants->push_back("etc2");
-			formats_imported.push_back("etc2");
-		}
-
-		if (!ok_on_pc) {
-			EditorNode::add_io_error("Warning, no suitable PC VRAM compression enabled in Project Settings. This texture will not display correctly on PC.");
-		}
-	} else {
-		//import normally
-		_save_tex(slices, p_save_path + "." + extension, compress_mode, lossy, Image::COMPRESS_S3TC /* IGNORED */, csource, used_channels, mipmaps, false);
-	}
-
+	Ref<LayeredTextureImport> texture_import;
+	texture_import.instantiate();
+	texture_import->csource = &csource;
+	texture_import->save_path = p_save_path;
+	texture_import->options = p_options;
+	texture_import->platform_variants = r_platform_variants;
+	texture_import->image = image;
+	texture_import->formats_imported = formats_imported;
+	texture_import->slices = &slices;
+	texture_import->compress_mode = compress_mode;
+	texture_import->lossy = lossy;
+	texture_import->hdr_compression = hdr_compression;
+	texture_import->bptc_ldr = bptc_ldr;
+	texture_import->mipmaps = mipmaps;
+	texture_import->used_channels = used_channels;
+	_check_compress_stex(texture_import);
 	if (r_metadata) {
 		Dictionary metadata;
 		metadata["vram_texture"] = compress_mode == COMPRESS_VRAM_COMPRESSED;
@@ -537,3 +474,76 @@ ResourceImporterLayeredTexture::ResourceImporterLayeredTexture() {
 
 ResourceImporterLayeredTexture::~ResourceImporterLayeredTexture() {
 }
+
+void ResourceImporterLayeredTexture::_check_compress_stex(Ref<LayeredTextureImport> r_texture_import) {
+	String extension = get_save_extension();
+	ERR_FAIL_NULL(r_texture_import->csource);
+	if (r_texture_import->compress_mode != COMPRESS_VRAM_COMPRESSED) {
+		// Import normally.
+		_save_tex(*r_texture_import->slices, r_texture_import->save_path + "." + extension, r_texture_import->compress_mode, r_texture_import->lossy, Image::COMPRESS_S3TC /* IGNORED */, *r_texture_import->csource, r_texture_import->used_channels, r_texture_import->mipmaps, false);
+		return;
+	}
+	// Must import in all formats, in order of priority (so the platform chooses the best supported one, i.e. etc2 over etc).
+	// Android, GLES 2.x
+
+	bool can_bptc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_bptc");
+	if (can_bptc) {
+		r_texture_import->formats_imported.push_back("bptc"); // BPTC needs to be added anyway.
+	}
+	bool can_compress_hdr = r_texture_import->hdr_compression > 0;
+	ERR_FAIL_NULL(r_texture_import->image);
+	bool is_hdr = (r_texture_import->image->get_format() >= Image::FORMAT_RF && r_texture_import->image->get_format() <= Image::FORMAT_RGBE9995);
+	bool is_ldr = (r_texture_import->image->get_format() >= Image::FORMAT_L8 && r_texture_import->image->get_format() <= Image::FORMAT_RGB565);
+	bool can_s3tc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_s3tc");
+	ERR_FAIL_NULL(r_texture_import->slices);
+	// Can compress hdr, but hdr with alpha is not compressible.
+	if (r_texture_import->hdr_compression == 2) {
+		// The user selected to compress hdr anyway, so force an alpha-less format.
+		if (r_texture_import->image->get_format() == Image::FORMAT_RGBAF) {
+			for (int i = 0; i < r_texture_import->slices->size(); i++) {
+				r_texture_import->slices->write[i]->convert(Image::FORMAT_RGBF);
+			}
+
+		} else if (r_texture_import->image->get_format() == Image::FORMAT_RGBAH) {
+			for (int i = 0; i < r_texture_import->slices->size(); i++) {
+				r_texture_import->slices->write[i]->convert(Image::FORMAT_RGBH);
+			}
+		}
+	} else {
+		can_compress_hdr = false;
+	}
+
+	if (is_hdr && can_compress_hdr) {
+		if (!can_bptc) {
+			// Default to RGBE.
+			if (r_texture_import->image->get_format() != Image::FORMAT_RGBE9995) {
+				for (int i = 0; i < r_texture_import->slices->size(); i++) {
+					r_texture_import->slices->write[i]->convert(Image::FORMAT_RGBE9995);
+				}
+			}
+		}
+	} else {
+		can_bptc = false;
+	}
+
+	if (is_ldr && can_bptc) {
+		if (r_texture_import->bptc_ldr == 0 || (r_texture_import->bptc_ldr == 1 && !(r_texture_import->used_channels == Image::USED_CHANNELS_LA || r_texture_import->used_channels == Image::USED_CHANNELS_RGBA))) {
+			can_bptc = false;
+		}
+	}
+	if (!(r_texture_import->used_channels == Image::USED_CHANNELS_LA || r_texture_import->used_channels == Image::USED_CHANNELS_RGBA)) {
+		if (ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_etc2")) {
+			_save_tex(*r_texture_import->slices, r_texture_import->save_path + ".etc2." + extension, r_texture_import->compress_mode, r_texture_import->lossy, Image::COMPRESS_ETC2, *r_texture_import->csource, r_texture_import->used_channels, r_texture_import->mipmaps, true);
+			r_texture_import->platform_variants->push_back("etc2");
+			r_texture_import->formats_imported.push_back("etc2");
+		}
+
+		if (can_bptc || can_s3tc) {
+			_save_tex(*r_texture_import->slices, r_texture_import->save_path + ".s3tc." + extension, r_texture_import->compress_mode, r_texture_import->lossy, can_bptc ? Image::COMPRESS_BPTC : Image::COMPRESS_S3TC, *r_texture_import->csource, r_texture_import->used_channels, r_texture_import->mipmaps, false);
+			r_texture_import->platform_variants->push_back("s3tc");
+			r_texture_import->formats_imported.push_back("s3tc");
+		}
+		return;
+	}
+	EditorNode::add_io_error("Warning, no suitable PC VRAM compression enabled in Project Settings. This texture will not display correctly on PC.");
+}
diff --git a/editor/import/resource_importer_layered_texture.h b/editor/import/resource_importer_layered_texture.h
index ee8e7dc615..edd981c63d 100644
--- a/editor/import/resource_importer_layered_texture.h
+++ b/editor/import/resource_importer_layered_texture.h
@@ -33,9 +33,30 @@
 
 #include "core/io/image.h"
 #include "core/io/resource_importer.h"
+#include "core/object/ref_counted.h"
 
 class StreamTexture2D;
 
+class LayeredTextureImport : public RefCounted { // Import state bundle passed as Ref<LayeredTextureImport> to ResourceImporterLayeredTexture::_check_compress_stex().
+	GDCLASS(LayeredTextureImport, RefCounted);
+
+public:
+	Image::CompressSource *csource = nullptr; // Dereferenced and forwarded to _save_tex() by the importer; not freed by this class.
+	String save_path; // Output path prefix; the importer appends ".etc2." / ".s3tc." plus the extension.
+	Map<StringName, Variant> options; // NOTE(review): presumably the import options map - confirm against the caller.
+	List<String> *platform_variants = nullptr; // Receives the "etc2" / "s3tc" tags of the variants that were saved; not freed by this class.
+	Ref<Image> image = nullptr; // Its format is compared against Image::FORMAT_RGBE9995 by the importer.
+	Array formats_imported; // Receives the same "etc2" / "s3tc" tags as platform_variants.
+	Vector<Ref<Image>> *slices = nullptr; // Forwarded to _save_tex(); may be converted in place to RGBE9995 by the importer. Not freed by this class.
+	int compress_mode = 0; // Forwarded to _save_tex().
+	float lossy = 1.0; // Forwarded to _save_tex().
+	int hdr_compression = 0; // NOTE(review): presumably the HDR compression import setting - confirm against the caller.
+	int bptc_ldr = 0; // Tested against 0 and 1 by the importer when deciding whether BPTC may be used for LDR images.
+	bool mipmaps = true; // Forwarded to _save_tex().
+	Image::UsedChannels used_channels = Image::USED_CHANNELS_RGBA; // Forwarded to _save_tex(); also compared against USED_CHANNELS_LA / USED_CHANNELS_RGBA.
+	virtual ~LayeredTextureImport() {} // Intentionally empty: the raw pointer members above are never deleted here.
+};
+
 class ResourceImporterLayeredTexture : public ResourceImporter {
 	GDCLASS(ResourceImporterLayeredTexture, ResourceImporter);
 
@@ -66,6 +87,8 @@ protected:
 	static ResourceImporterLayeredTexture *singleton;
 
 public:
+	void _check_compress_stex(Ref<LayeredTextureImport> r_texture_import);
+
 	static ResourceImporterLayeredTexture *get_singleton() { return singleton; }
 	virtual String get_importer_name() const override;
 	virtual String get_visible_name() const override;
diff --git a/editor/import/resource_importer_scene.cpp b/editor/import/resource_importer_scene.cpp
index d9448dd4a9..5d356604f4 100644
--- a/editor/import/resource_importer_scene.cpp
+++ b/editor/import/resource_importer_scene.cpp
@@ -258,8 +258,8 @@ String ResourceImporterScene::get_visible_name() const {
 }
 
 void ResourceImporterScene::get_recognized_extensions(List<String> *p_extensions) const {
-	for (Set<Ref<EditorSceneFormatImporter>>::Element *E = importers.front(); E; E = E->next()) {
-		E->get()->get_extensions(p_extensions);
+	for (Ref<EditorSceneFormatImporter> importer_elem : importers) {
+		importer_elem->get_extensions(p_extensions);
 	}
 }
 
@@ -1490,8 +1490,8 @@ void ResourceImporterScene::get_import_options(const String &p_path, List<Import
 		post_importer_plugins.write[i]->get_import_options(p_path, r_options);
 	}
 
-	for (Ref<EditorSceneFormatImporter> importer : importers) {
-		importer->get_import_options(p_path, r_options);
+	for (Ref<EditorSceneFormatImporter> importer_elem : importers) {
+		importer_elem->get_import_options(p_path, r_options);
 	}
 }
 
@@ -1843,13 +1843,13 @@ Node *ResourceImporterScene::pre_import(const String &p_source_file) {
 	EditorProgress progress("pre-import", TTR("Pre-Import Scene"), 0);
 	progress.step(TTR("Importing Scene..."), 0);
 
-	for (Set<Ref<EditorSceneFormatImporter>>::Element *E = importers.front(); E; E = E->next()) {
+	for (Ref<EditorSceneFormatImporter> importer_elem : importers) {
 		List<String> extensions;
-		E->get()->get_extensions(&extensions);
+		importer_elem->get_extensions(&extensions);
 
 		for (const String &F : extensions) {
 			if (F.to_lower() == ext) {
-				importer = E->get();
+				importer = importer_elem;
 				break;
 			}
 		}
@@ -1883,13 +1883,13 @@ Error ResourceImporterScene::import(const String &p_source_file, const String &p
 	EditorProgress progress("import", TTR("Import Scene"), 104);
 	progress.step(TTR("Importing Scene..."), 0);
 
-	for (Set<Ref<EditorSceneFormatImporter>>::Element *E = importers.front(); E; E = E->next()) {
+	for (Ref<EditorSceneFormatImporter> importer_elem : importers) {
 		List<String> extensions;
-		E->get()->get_extensions(&extensions);
+		importer_elem->get_extensions(&extensions);
 
 		for (const String &F : extensions) {
 			if (F.to_lower() == ext) {
-				importer = E->get();
+				importer = importer_elem;
 				break;
 			}
 		}
@@ -2088,6 +2088,24 @@ ResourceImporterScene::ResourceImporterScene() {
 	singleton = this;
 }
 
+void ResourceImporterScene::add_importer(Ref<EditorSceneFormatImporter> p_importer) { // Registers a scene format importer.
+	ERR_FAIL_COND(p_importer.is_null()); // Null references are rejected and nothing is stored.
+	importers.insert(0, p_importer); // Stored at index 0, i.e. ahead of every importer added earlier; no duplicate check is done.
+}
+
+void ResourceImporterScene::remove_post_importer_plugin(const Ref<EditorScenePostImportPlugin> &p_plugin) { // Unregisters a post-import plugin.
+	post_importer_plugins.erase(p_plugin); // No null check here; the reference is handed straight to erase().
+}
+
+void ResourceImporterScene::add_post_importer_plugin(const Ref<EditorScenePostImportPlugin> &p_plugin) { // Registers a post-import plugin.
+	ERR_FAIL_COND(p_plugin.is_null()); // Null references are rejected and nothing is stored.
+	post_importer_plugins.insert(0, p_plugin); // Stored at index 0, i.e. ahead of every plugin added earlier.
+}
+
+void ResourceImporterScene::remove_importer(Ref<EditorSceneFormatImporter> p_importer) { // Unregisters a scene format importer.
+	importers.erase(p_importer); // No null check here; the reference is handed straight to erase().
+}
+
 ///////////////////////////////////////
 
 uint32_t EditorSceneFormatImporterESCN::get_import_flags() const {
diff --git a/editor/import/resource_importer_scene.h b/editor/import/resource_importer_scene.h
index 13b55b5754..a3aeac4022 100644
--- a/editor/import/resource_importer_scene.h
+++ b/editor/import/resource_importer_scene.h
@@ -153,7 +153,7 @@ VARIANT_ENUM_CAST(EditorScenePostImportPlugin::InternalImportCategory)
 class ResourceImporterScene : public ResourceImporter {
 	GDCLASS(ResourceImporterScene, ResourceImporter);
 
-	Set<Ref<EditorSceneFormatImporter>> importers;
+	Vector<Ref<EditorSceneFormatImporter>> importers;
 
 	static ResourceImporterScene *singleton;
 
@@ -224,13 +224,13 @@ class ResourceImporterScene : public ResourceImporter {
 public:
 	static ResourceImporterScene *get_singleton() { return singleton; }
 
-	void add_post_importer_plugin(const Ref<EditorScenePostImportPlugin> &p_plugin) { post_importer_plugins.push_back(p_plugin); }
-	void remove_post_importer_plugin(const Ref<EditorScenePostImportPlugin> &p_plugin) { post_importer_plugins.erase(p_plugin); }
+	void add_post_importer_plugin(const Ref<EditorScenePostImportPlugin> &p_plugin);
+	void remove_post_importer_plugin(const Ref<EditorScenePostImportPlugin> &p_plugin);
 
-	const Set<Ref<EditorSceneFormatImporter>> &get_importers() const { return importers; }
+	const Vector<Ref<EditorSceneFormatImporter>> &get_importers() const { return importers; }
 
-	void add_importer(Ref<EditorSceneFormatImporter> p_importer) { importers.insert(p_importer); }
-	void remove_importer(Ref<EditorSceneFormatImporter> p_importer) { importers.erase(p_importer); }
+	void add_importer(Ref<EditorSceneFormatImporter> p_importer);
+	void remove_importer(Ref<EditorSceneFormatImporter> p_importer);
 
 	virtual String get_importer_name() const override;
 	virtual String get_visible_name() const override;
diff --git a/editor/import/resource_importer_texture.cpp b/editor/import/resource_importer_texture.cpp
index 69c705ed5a..127cd4511e 100644
--- a/editor/import/resource_importer_texture.cpp
+++ b/editor/import/resource_importer_texture.cpp
@@ -496,11 +496,10 @@ Error ResourceImporterTexture::import(const String &p_source_file, const String
 		//must import in all formats, in order of priority (so platform choses the best supported one. IE, etc2 over etc).
 		//Android, GLES 2.x
 
-		bool ok_on_pc = false;
-		bool is_hdr = (image->get_format() >= Image::FORMAT_RF && image->get_format() <= Image::FORMAT_RGBE9995);
+		const bool is_hdr = (image->get_format() >= Image::FORMAT_RF && image->get_format() <= Image::FORMAT_RGBE9995);
 		bool is_ldr = (image->get_format() >= Image::FORMAT_L8 && image->get_format() <= Image::FORMAT_RGB565);
-		bool can_bptc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_bptc");
-		bool can_s3tc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_s3tc");
+		const bool can_bptc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_bptc");
+		const bool can_s3tc = ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_s3tc");
 
 		if (can_bptc) {
 			//add to the list anyway
@@ -525,29 +524,24 @@ Error ResourceImporterTexture::import(const String &p_source_file, const String
 				}
 			}
 
-			if (can_compress_hdr) {
-				if (!can_bptc) {
-					//fallback to RGBE99995
-					if (image->get_format() != Image::FORMAT_RGBE9995) {
-						image->convert(Image::FORMAT_RGBE9995);
-					}
+			if (!can_compress_hdr) {
+				//fallback to RGBE9995
+				if (image->get_format() != Image::FORMAT_RGBE9995) {
+					image->convert(Image::FORMAT_RGBE9995);
 				}
-			} else {
-				can_bptc = false;
-			}
-		}
-
-		if (is_ldr && can_bptc) {
-			if (bptc_ldr == 0 || (bptc_ldr == 1 && !has_alpha)) {
-				can_bptc = false;
 			}
 		}
 
+		bool ok_on_pc = false;
 		if (can_bptc || can_s3tc) {
-			_save_stex(image, p_save_path + ".s3tc.stex", compress_mode, lossy, can_bptc ? Image::COMPRESS_BPTC : Image::COMPRESS_S3TC, mipmaps, stream, detect_3d, detect_roughness, detect_normal, force_normal, srgb_friendly_pack, false, mipmap_limit, normal_image, roughness_channel);
+			ok_on_pc = true;
+			Image::CompressMode image_compress_mode = Image::COMPRESS_BPTC;
+			if (!bptc_ldr && can_s3tc && is_ldr) {
+				image_compress_mode = Image::COMPRESS_S3TC;
+			}
+			_save_stex(image, p_save_path + ".s3tc.stex", compress_mode, lossy, image_compress_mode, mipmaps, stream, detect_3d, detect_roughness, detect_normal, force_normal, srgb_friendly_pack, false, mipmap_limit, normal_image, roughness_channel);
 			r_platform_variants->push_back("s3tc");
 			formats_imported.push_back("s3tc");
-			ok_on_pc = true;
 		}
 
 		if (ProjectSettings::get_singleton()->get("rendering/textures/vram_compression/import_etc2")) {
diff --git a/editor/scene_tree_editor.cpp b/editor/scene_tree_editor.cpp
index c755bca64f..fcb4f5b32e 100644
--- a/editor/scene_tree_editor.cpp
+++ b/editor/scene_tree_editor.cpp
@@ -362,6 +362,17 @@ bool SceneTreeEditor::_add_nodes(Node *p_node, TreeItem *p_parent, bool p_scroll
 			}
 
 			_update_visibility_color(p_node, item);
+		} else if (p_node->is_class("CanvasLayer")) {
+			bool v = p_node->call("is_visible");
+			if (v) {
+				item->add_button(0, get_theme_icon("GuiVisibilityVisible", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
+			} else {
+				item->add_button(0, get_theme_icon("GuiVisibilityHidden", "EditorIcons"), BUTTON_VISIBILITY, false, TTR("Toggle Visibility"));
+			}
+
+			if (!p_node->is_connected("visibility_changed", callable_mp(this, &SceneTreeEditor::_node_visibility_changed))) {
+				p_node->connect("visibility_changed", callable_mp(this, &SceneTreeEditor::_node_visibility_changed), varray(p_node));
+			}
 		} else if (p_node->is_class("Node3D")) {
 			bool is_locked = p_node->has_meta("_edit_lock_");
 			if (is_locked) {
@@ -471,6 +482,9 @@ void SceneTreeEditor::_node_visibility_changed(Node *p_node) {
 	if (p_node->is_class("CanvasItem")) {
 		visible = p_node->call("is_visible");
 		CanvasItemEditor::get_singleton()->get_viewport_control()->update();
+	} else if (p_node->is_class("CanvasLayer")) {
+		visible = p_node->call("is_visible");
+		CanvasItemEditor::get_singleton()->get_viewport_control()->update();
 	} else if (p_node->is_class("Node3D")) {
 		visible = p_node->call("is_visible");
 	}
@@ -514,7 +528,7 @@ void SceneTreeEditor::_node_removed(Node *p_node) {
 		p_node->disconnect("script_changed", callable_mp(this, &SceneTreeEditor::_node_script_changed));
 	}
 
-	if (p_node->is_class("Node3D") || p_node->is_class("CanvasItem")) {
+	if (p_node->is_class("Node3D") || p_node->is_class("CanvasItem") || p_node->is_class("CanvasLayer")) {
 		if (p_node->is_connected("visibility_changed", callable_mp(this, &SceneTreeEditor::_node_visibility_changed))) {
 			p_node->disconnect("visibility_changed", callable_mp(this, &SceneTreeEditor::_node_visibility_changed));
 		}
diff --git a/editor/script_create_dialog.cpp b/editor/script_create_dialog.cpp
index 0c916bf56a..cafa12c42e 100644
--- a/editor/script_create_dialog.cpp
+++ b/editor/script_create_dialog.cpp
@@ -763,10 +763,10 @@ void ScriptCreateDialog::_update_dialog() {
 }
 
 ScriptLanguage::ScriptTemplate ScriptCreateDialog::_get_current_template() const {
-	int selected_id = template_menu->get_selected_id();
+	int selected_index = template_menu->get_selected();
 	for (const ScriptLanguage::ScriptTemplate &t : template_list) {
 		if (is_using_templates) {
-			if (t.id == selected_id) {
+			if (t.id == selected_index) {
 				return t;
 			}
 		} else {
diff --git a/modules/cvtt/SCsub b/modules/cvtt/SCsub
index e56177d6e9..1d5a7ff6a3 100644
--- a/modules/cvtt/SCsub
+++ b/modules/cvtt/SCsub
@@ -11,7 +11,16 @@ thirdparty_obj = []
 
 thirdparty_dir = "#thirdparty/cvtt/"
 thirdparty_sources = [
-    "ConvectionKernels.cpp",
+    "ConvectionKernels_API.cpp",
+    "ConvectionKernels_ETC.cpp",
+    "ConvectionKernels_BC67.cpp",
+    "ConvectionKernels_IndexSelector.cpp",
+    "ConvectionKernels_BC6H_IO.cpp",
+    "ConvectionKernels_S3TC.cpp",
+    "ConvectionKernels_BC7_PrioData.cpp",
+    "ConvectionKernels_SingleFile.cpp",
+    "ConvectionKernels_BCCommon.cpp",
+    "ConvectionKernels_Util.cpp",
 ]
 
 thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
diff --git a/modules/cvtt/image_compress_cvtt.cpp b/modules/cvtt/image_compress_cvtt.cpp
index 9e0579740b..d18340a2c8 100644
--- a/modules/cvtt/image_compress_cvtt.cpp
+++ b/modules/cvtt/image_compress_cvtt.cpp
@@ -41,7 +41,7 @@ struct CVTTCompressionJobParams {
 	bool is_hdr = false;
 	bool is_signed = false;
 	int bytes_per_pixel = 0;
-
+	cvtt::BC7EncodingPlan bc7_plan;
 	cvtt::Options options;
 };
 
@@ -116,7 +116,7 @@ static void _digest_row_task(const CVTTCompressionJobParams &p_job_params, const
 				cvtt::Kernels::EncodeBC6HU(output_blocks, input_blocks_hdr, p_job_params.options);
 			}
 		} else {
-			cvtt::Kernels::EncodeBC7(output_blocks, input_blocks_ldr, p_job_params.options);
+			cvtt::Kernels::EncodeBC7(output_blocks, input_blocks_ldr, p_job_params.options, p_job_params.bc7_plan);
 		}
 
 		unsigned int num_real_blocks = ((w - x_start) + 3) / 4;
@@ -141,7 +141,6 @@ void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::UsedChann
 	if (p_image->get_format() >= Image::FORMAT_BPTC_RGBA) {
 		return; //do not compress, already compressed
 	}
-
 	int w = p_image->get_width();
 	int h = p_image->get_height();
 
@@ -153,22 +152,8 @@ void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::UsedChann
 	}
 
 	cvtt::Options options;
-	uint32_t flags = cvtt::Flags::Fastest;
-
-	if (p_lossy_quality > 0.85) {
-		flags = cvtt::Flags::Ultra;
-	} else if (p_lossy_quality > 0.75) {
-		flags = cvtt::Flags::Better;
-	} else if (p_lossy_quality > 0.55) {
-		flags = cvtt::Flags::Default;
-	} else if (p_lossy_quality > 0.35) {
-		flags = cvtt::Flags::Fast;
-	} else if (p_lossy_quality > 0.15) {
-		flags = cvtt::Flags::Faster;
-	}
-
+	uint32_t flags = cvtt::Flags::Default;
 	flags |= cvtt::Flags::BC7_RespectPunchThrough;
-
 	if (p_channels == Image::USED_CHANNELS_RG) { //guessing this is a normal map
 		flags |= cvtt::Flags::Uniform;
 	}
@@ -215,12 +200,15 @@ void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::UsedChann
 	job_queue.job_params.is_signed = is_signed;
 	job_queue.job_params.options = options;
 	job_queue.job_params.bytes_per_pixel = is_hdr ? 6 : 4;
+	cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(job_queue.job_params.bc7_plan, 5);
 
-#ifdef NO_THREADS
 	int num_job_threads = 0;
-#else
-	int num_job_threads = OS::get_singleton()->can_use_threads() ? (OS::get_singleton()->get_processor_count() - 1) : 0;
-#endif
+	// Amdahl's law (Wikipedia)
+	// If a program needs 20 hours to complete using a single thread, but a one-hour portion of the program cannot be parallelized,
+	// therefore only the remaining 19 hours (p = 0.95) of execution time can be parallelized, then regardless of how many threads are devoted
+	// to a parallelized execution of this program, the minimum execution time cannot be less than one hour.
+	//
+	// The number of executions with different inputs can be increased while the latency is the same.
 
 	Vector<CVTTCompressionRowTask> tasks;
 
@@ -278,7 +266,6 @@ void image_compress_cvtt(Image *p_image, float p_lossy_quality, Image::UsedChann
 			memdelete(threads_wb[i]);
 		}
 	}
-
 	p_image->create(p_image->get_width(), p_image->get_height(), p_image->has_mipmaps(), target_format, data);
 }
 
@@ -388,6 +375,5 @@ void image_decompress_cvtt(Image *p_image) {
 		w >>= 1;
 		h >>= 1;
 	}
-
 	p_image->create(p_image->get_width(), p_image->get_height(), p_image->has_mipmaps(), target_format, data);
 }
diff --git a/modules/websocket/wsl_client.cpp b/modules/websocket/wsl_client.cpp
index be1c75c354..bebb198126 100644
--- a/modules/websocket/wsl_client.cpp
+++ b/modules/websocket/wsl_client.cpp
@@ -163,22 +163,24 @@ Error WSLClient::connect_to_host(String p_host, String p_path, uint16_t p_port,
 	_peer = Ref<WSLPeer>(memnew(WSLPeer));
 
 	if (p_host.is_valid_ip_address()) {
-		ip_candidates.clear();
-		ip_candidates.push_back(IPAddress(p_host));
+		_ip_candidates.push_back(IPAddress(p_host));
 	} else {
-		ip_candidates = IP::get_singleton()->resolve_hostname_addresses(p_host);
-	}
-
-	ERR_FAIL_COND_V(ip_candidates.is_empty(), ERR_INVALID_PARAMETER);
-
-	String port = "";
-	if ((p_port != 80 && !p_ssl) || (p_port != 443 && p_ssl)) {
-		port = ":" + itos(p_port);
+		// Queue hostname for resolution.
+		_resolver_id = IP::get_singleton()->resolve_hostname_queue_item(p_host);
+		ERR_FAIL_COND_V(_resolver_id == IP::RESOLVER_INVALID_ID, ERR_INVALID_PARAMETER);
+		// Check if it was found in cache.
+		IP::ResolverStatus ip_status = IP::get_singleton()->get_resolve_item_status(_resolver_id);
+		if (ip_status == IP::RESOLVER_STATUS_DONE) {
+			_ip_candidates = IP::get_singleton()->get_resolve_item_addresses(_resolver_id);
+			IP::get_singleton()->erase_resolve_item(_resolver_id);
+			_resolver_id = IP::RESOLVER_INVALID_ID;
+		}
 	}
 
-	Error err = ERR_BUG; // Should be at least one entry.
-	while (ip_candidates.size() > 0) {
-		err = _tcp->connect_to_host(ip_candidates.pop_front(), p_port);
+	// We assume OK while hostname resolution is pending.
+	Error err = _resolver_id != IP::RESOLVER_INVALID_ID ? OK : FAILED;
+	while (_ip_candidates.size()) {
+		err = _tcp->connect_to_host(_ip_candidates.pop_front(), p_port);
 		if (err == OK) {
 			break;
 		}
@@ -200,8 +202,11 @@ Error WSLClient::connect_to_host(String p_host, String p_path, uint16_t p_port,
 	}
 
 	_key = WSLPeer::generate_key();
-	// TODO custom extra headers (allow overriding this too?)
 	String request = "GET " + p_path + " HTTP/1.1\r\n";
+	String port = "";
+	if ((p_port != 80 && !p_ssl) || (p_port != 443 && p_ssl)) {
+		port = ":" + itos(p_port);
+	}
 	request += "Host: " + p_host + port + "\r\n";
 	request += "Upgrade: websocket\r\n";
 	request += "Connection: Upgrade\r\n";
@@ -231,6 +236,30 @@ int WSLClient::get_max_packet_size() const {
 }
 
 void WSLClient::poll() {
+	if (_resolver_id != IP::RESOLVER_INVALID_ID) {
+		IP::ResolverStatus ip_status = IP::get_singleton()->get_resolve_item_status(_resolver_id);
+		if (ip_status == IP::RESOLVER_STATUS_WAITING) {
+			return;
+		}
+		// Anything else is either a candidate or a failure.
+		Error err = FAILED;
+		if (ip_status == IP::RESOLVER_STATUS_DONE) {
+			_ip_candidates = IP::get_singleton()->get_resolve_item_addresses(_resolver_id);
+			while (_ip_candidates.size()) {
+				err = _tcp->connect_to_host(_ip_candidates.pop_front(), _port);
+				if (err == OK) {
+					break;
+				}
+			}
+		}
+		IP::get_singleton()->erase_resolve_item(_resolver_id);
+		_resolver_id = IP::RESOLVER_INVALID_ID;
+		if (err != OK) {
+			disconnect_from_host();
+			_on_error();
+			return;
+		}
+	}
 	if (_peer->is_connected_to_host()) {
 		_peer->poll();
 		if (!_peer->is_connected_to_host()) {
@@ -251,7 +280,7 @@ void WSLClient::poll() {
 			_on_error();
 			break;
 		case StreamPeerTCP::STATUS_CONNECTED: {
-			ip_candidates.clear();
+			_ip_candidates.clear();
 			Ref<StreamPeerSSL> ssl;
 			if (_use_ssl) {
 				if (_connection == _tcp) {
@@ -282,9 +311,9 @@ void WSLClient::poll() {
 			_do_handshake();
 		} break;
 		case StreamPeerTCP::STATUS_ERROR:
-			while (ip_candidates.size() > 0) {
+			while (_ip_candidates.size() > 0) {
 				_tcp->disconnect_from_host();
-				if (_tcp->connect_to_host(ip_candidates.pop_front(), _port) == OK) {
+				if (_tcp->connect_to_host(_ip_candidates.pop_front(), _port) == OK) {
 					return;
 				}
 			}
@@ -307,7 +336,7 @@ MultiplayerPeer::ConnectionStatus WSLClient::get_connection_status() const {
 		return CONNECTION_CONNECTED;
 	}
 
-	if (_tcp->is_connected_to_host()) {
+	if (_tcp->is_connected_to_host() || _resolver_id != IP::RESOLVER_INVALID_ID) {
 		return CONNECTION_CONNECTING;
 	}
 
@@ -330,7 +359,12 @@ void WSLClient::disconnect_from_host(int p_code, String p_reason) {
 	memset(_resp_buf, 0, sizeof(_resp_buf));
 	_resp_pos = 0;
 
-	ip_candidates.clear();
+	if (_resolver_id != IP::RESOLVER_INVALID_ID) {
+		IP::get_singleton()->erase_resolve_item(_resolver_id);
+		_resolver_id = IP::RESOLVER_INVALID_ID;
+	}
+
+	_ip_candidates.clear();
 }
 
 IPAddress WSLClient::get_connected_host() const {
diff --git a/modules/websocket/wsl_client.h b/modules/websocket/wsl_client.h
index 4839d7ab9b..d846e6be00 100644
--- a/modules/websocket/wsl_client.h
+++ b/modules/websocket/wsl_client.h
@@ -63,10 +63,11 @@ private:
 
 	String _key;
 	String _host;
-	int _port;
-	Array ip_candidates;
+	uint16_t _port;
+	Array _ip_candidates;
 	Vector<String> _protocols;
 	bool _use_ssl = false;
+	IP::ResolverID _resolver_id = IP::RESOLVER_INVALID_ID;
 
 	void _do_handshake();
 	bool _verify_headers(String &r_protocol);
diff --git a/scene/gui/texture_button.cpp b/scene/gui/texture_button.cpp
index 89a17ae854..26acfaaa70 100644
--- a/scene/gui/texture_button.cpp
+++ b/scene/gui/texture_button.cpp
@@ -37,7 +37,7 @@
 Size2 TextureButton::get_minimum_size() const {
 	Size2 rscale = Control::get_minimum_size();
 
-	if (!expand) {
+	if (!ignore_texture_size) {
 		if (normal.is_null()) {
 			if (pressed.is_null()) {
 				if (hover.is_null()) {
@@ -182,50 +182,48 @@ void TextureButton::_notification(int p_what) {
 				size = texdraw->get_size();
 				_texture_region = Rect2(Point2(), texdraw->get_size());
 				_tile = false;
-				if (expand) {
-					switch (stretch_mode) {
-						case STRETCH_KEEP:
-							size = texdraw->get_size();
-							break;
-						case STRETCH_SCALE:
-							size = get_size();
-							break;
-						case STRETCH_TILE:
-							size = get_size();
-							_tile = true;
-							break;
-						case STRETCH_KEEP_CENTERED:
-							ofs = (get_size() - texdraw->get_size()) / 2;
-							size = texdraw->get_size();
-							break;
-						case STRETCH_KEEP_ASPECT_CENTERED:
-						case STRETCH_KEEP_ASPECT: {
-							Size2 _size = get_size();
-							float tex_width = texdraw->get_width() * _size.height / texdraw->get_height();
-							float tex_height = _size.height;
-
-							if (tex_width > _size.width) {
-								tex_width = _size.width;
-								tex_height = texdraw->get_height() * tex_width / texdraw->get_width();
-							}
+				switch (stretch_mode) {
+					case STRETCH_KEEP:
+						size = texdraw->get_size();
+						break;
+					case STRETCH_SCALE:
+						size = get_size();
+						break;
+					case STRETCH_TILE:
+						size = get_size();
+						_tile = true;
+						break;
+					case STRETCH_KEEP_CENTERED:
+						ofs = (get_size() - texdraw->get_size()) / 2;
+						size = texdraw->get_size();
+						break;
+					case STRETCH_KEEP_ASPECT_CENTERED:
+					case STRETCH_KEEP_ASPECT: {
+						Size2 _size = get_size();
+						float tex_width = texdraw->get_width() * _size.height / texdraw->get_height();
+						float tex_height = _size.height;
+
+						if (tex_width > _size.width) {
+							tex_width = _size.width;
+							tex_height = texdraw->get_height() * tex_width / texdraw->get_width();
+						}
 
-							if (stretch_mode == STRETCH_KEEP_ASPECT_CENTERED) {
-								ofs.x = (_size.width - tex_width) / 2;
-								ofs.y = (_size.height - tex_height) / 2;
-							}
-							size.width = tex_width;
-							size.height = tex_height;
-						} break;
-						case STRETCH_KEEP_ASPECT_COVERED: {
-							size = get_size();
-							Size2 tex_size = texdraw->get_size();
-							Size2 scale_size(size.width / tex_size.width, size.height / tex_size.height);
-							float scale = scale_size.width > scale_size.height ? scale_size.width : scale_size.height;
-							Size2 scaled_tex_size = tex_size * scale;
-							Point2 ofs2 = ((scaled_tex_size - size) / scale).abs() / 2.0f;
-							_texture_region = Rect2(ofs2, size / scale);
-						} break;
-					}
+						if (stretch_mode == STRETCH_KEEP_ASPECT_CENTERED) {
+							ofs.x = (_size.width - tex_width) / 2;
+							ofs.y = (_size.height - tex_height) / 2;
+						}
+						size.width = tex_width;
+						size.height = tex_height;
+					} break;
+					case STRETCH_KEEP_ASPECT_COVERED: {
+						size = get_size();
+						Size2 tex_size = texdraw->get_size();
+						Size2 scale_size(size.width / tex_size.width, size.height / tex_size.height);
+						float scale = scale_size.width > scale_size.height ? scale_size.width : scale_size.height;
+						Size2 scaled_tex_size = tex_size * scale;
+						Point2 ofs2 = ((scaled_tex_size - size) / scale).abs() / 2.0f;
+						_texture_region = Rect2(ofs2, size / scale);
+					} break;
 				}
 
 				_position_rect = Rect2(ofs, size);
@@ -258,7 +256,7 @@ void TextureButton::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_disabled_texture", "texture"), &TextureButton::set_disabled_texture);
 	ClassDB::bind_method(D_METHOD("set_focused_texture", "texture"), &TextureButton::set_focused_texture);
 	ClassDB::bind_method(D_METHOD("set_click_mask", "mask"), &TextureButton::set_click_mask);
-	ClassDB::bind_method(D_METHOD("set_expand", "expand"), &TextureButton::set_expand);
+	ClassDB::bind_method(D_METHOD("set_ignore_texture_size", "ignore"), &TextureButton::set_ignore_texture_size);
 	ClassDB::bind_method(D_METHOD("set_stretch_mode", "mode"), &TextureButton::set_stretch_mode);
 	ClassDB::bind_method(D_METHOD("set_flip_h", "enable"), &TextureButton::set_flip_h);
 	ClassDB::bind_method(D_METHOD("is_flipped_h"), &TextureButton::is_flipped_h);
@@ -271,7 +269,7 @@ void TextureButton::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_disabled_texture"), &TextureButton::get_disabled_texture);
 	ClassDB::bind_method(D_METHOD("get_focused_texture"), &TextureButton::get_focused_texture);
 	ClassDB::bind_method(D_METHOD("get_click_mask"), &TextureButton::get_click_mask);
-	ClassDB::bind_method(D_METHOD("get_expand"), &TextureButton::get_expand);
+	ClassDB::bind_method(D_METHOD("get_ignore_texture_size"), &TextureButton::get_ignore_texture_size);
 	ClassDB::bind_method(D_METHOD("get_stretch_mode"), &TextureButton::get_stretch_mode);
 
 	ADD_GROUP("Textures", "texture_");
@@ -281,7 +279,7 @@ void TextureButton::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "texture_disabled", PROPERTY_HINT_RESOURCE_TYPE, "Texture2D"), "set_disabled_texture", "get_disabled_texture");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "texture_focused", PROPERTY_HINT_RESOURCE_TYPE, "Texture2D"), "set_focused_texture", "get_focused_texture");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "texture_click_mask", PROPERTY_HINT_RESOURCE_TYPE, "BitMap"), "set_click_mask", "get_click_mask");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "expand", PROPERTY_HINT_RESOURCE_TYPE, "bool"), "set_expand", "get_expand");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "ignore_texture_size", PROPERTY_HINT_RESOURCE_TYPE, "bool"), "set_ignore_texture_size", "get_ignore_texture_size");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "stretch_mode", PROPERTY_HINT_ENUM, "Scale,Tile,Keep,Keep Centered,Keep Aspect,Keep Aspect Centered,Keep Aspect Covered"), "set_stretch_mode", "get_stretch_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "flip_h", PROPERTY_HINT_RESOURCE_TYPE, "bool"), "set_flip_h", "is_flipped_h");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "flip_v", PROPERTY_HINT_RESOURCE_TYPE, "bool"), "set_flip_v", "is_flipped_v");
@@ -352,12 +350,12 @@ void TextureButton::set_focused_texture(const Ref<Texture2D> &p_focused) {
 	focused = p_focused;
 };
 
-bool TextureButton::get_expand() const {
-	return expand;
+bool TextureButton::get_ignore_texture_size() const { // Getter bound to scripts as "get_ignore_texture_size".
+	return ignore_texture_size;
 }
 
-void TextureButton::set_expand(bool p_expand) {
-	expand = p_expand;
+void TextureButton::set_ignore_texture_size(bool p_ignore) { // Setter bound to scripts as "set_ignore_texture_size".
+	ignore_texture_size = p_ignore; // get_minimum_size() only runs its texture lookup branch while this is false.
 	update_minimum_size();
 	update();
 }
diff --git a/scene/gui/texture_button.h b/scene/gui/texture_button.h
index 1428a79a1d..5762949acd 100644
--- a/scene/gui/texture_button.h
+++ b/scene/gui/texture_button.h
@@ -54,8 +54,8 @@ private:
 	Ref<Texture2D> disabled;
 	Ref<Texture2D> focused;
 	Ref<BitMap> click_mask;
-	bool expand = false;
-	StretchMode stretch_mode = STRETCH_SCALE;
+	bool ignore_texture_size = false;
+	StretchMode stretch_mode = STRETCH_KEEP;
 
 	Rect2 _texture_region;
 	Rect2 _position_rect;
@@ -85,8 +85,8 @@ public:
 	Ref<Texture2D> get_focused_texture() const;
 	Ref<BitMap> get_click_mask() const;
 
-	bool get_expand() const;
-	void set_expand(bool p_expand);
+	bool get_ignore_texture_size() const;
+	void set_ignore_texture_size(bool p_ignore);
 
 	void set_stretch_mode(StretchMode p_stretch_mode);
 	StretchMode get_stretch_mode() const;
diff --git a/scene/gui/tree.cpp b/scene/gui/tree.cpp
index 1b32884880..a190e08088 100644
--- a/scene/gui/tree.cpp
+++ b/scene/gui/tree.cpp
@@ -2490,7 +2490,7 @@ int Tree::propagate_mouse_event(const Point2i &p_pos, int x_ofs, int y_ofs, int
 			/* process selection */
 
 			if (p_double_click && (!c.editable || c.mode == TreeItem::CELL_MODE_CUSTOM || c.mode == TreeItem::CELL_MODE_ICON /*|| c.mode==TreeItem::CELL_MODE_CHECK*/)) { //it's confusing for check
-
+				// Emits the "item_activated" signal.
 				propagate_mouse_activated = true;
 
 				incr_search.clear();
diff --git a/scene/main/canvas_item.cpp b/scene/main/canvas_item.cpp
index a0916c6291..a62bbb146c 100644
--- a/scene/main/canvas_item.cpp
+++ b/scene/main/canvas_item.cpp
@@ -72,6 +72,15 @@ bool CanvasItem::is_visible_in_tree() const {
 		p = p->get_parent_item();
 	}
 
+	const Node *n = get_parent();
+	while (n) {
+		const CanvasLayer *c = Object::cast_to<CanvasLayer>(n);
+		if (c && !c->is_visible()) {
+			return false;
+		}
+		n = n->get_parent();
+	}
+
 	return true;
 }
 
diff --git a/scene/main/canvas_item.h b/scene/main/canvas_item.h
index 3d49d89746..08fea52c3a 100644
--- a/scene/main/canvas_item.h
+++ b/scene/main/canvas_item.h
@@ -46,6 +46,8 @@ class World2D;
 class CanvasItem : public Node {
 	GDCLASS(CanvasItem, Node);
 
+	friend class CanvasLayer;
+
 public:
 	enum TextureFilter {
 		TEXTURE_FILTER_PARENT_NODE,
diff --git a/scene/main/canvas_layer.cpp b/scene/main/canvas_layer.cpp
index 282ab6b497..3f3e72357b 100644
--- a/scene/main/canvas_layer.cpp
+++ b/scene/main/canvas_layer.cpp
@@ -29,6 +29,7 @@
 /*************************************************************************/
 
 #include "canvas_layer.h"
+#include "canvas_item.h"
 #include "viewport.h"
 
 void CanvasLayer::set_layer(int p_xform) {
@@ -42,6 +43,32 @@ int CanvasLayer::get_layer() const {
 	return layer;
 }
 
+void CanvasLayer::set_visible(bool p_visible) { // Updates the layer's visibility flag and pushes the change to its direct CanvasItem children.
+	if (p_visible == visible) { // Unchanged value: no signal is emitted and no child is touched.
+		return;
+	}
+
+	visible = p_visible;
+	emit_signal(SNAME("visibility_changed")); // Emitted before any child is updated.
+
+	for (int i = 0; i < get_child_count(); i++) { // Only direct children are visited in this loop.
+		CanvasItem *c = Object::cast_to<CanvasItem>(get_child(i));
+		if (c) { // Children that are not CanvasItems are skipped.
+			RenderingServer::get_singleton()->canvas_item_set_visible(c->get_canvas_item(), p_visible && c->is_visible()); // Shown only if both the layer and the child itself are visible.
+
+			if (c->is_visible()) { // Child's own flag is on: hand the new state to its propagation helper (presumably recurses into its subtree - confirm).
+				c->_propagate_visibility_changed(p_visible);
+			} else { // Child's own flag is off: it only receives the visibility-changed notification.
+				c->notification(CanvasItem::NOTIFICATION_VISIBILITY_CHANGED);
+			}
+		}
+	}
+}
+
+bool CanvasLayer::is_visible() const { // Getter for the "visible" property.
+	return visible; // This layer's own flag only; no parent or viewport state is consulted.
+}
+
 void CanvasLayer::set_transform(const Transform2D &p_xform) {
 	transform = p_xform;
 	locrotscale_dirty = true;
@@ -264,6 +291,9 @@ void CanvasLayer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_layer", "layer"), &CanvasLayer::set_layer);
 	ClassDB::bind_method(D_METHOD("get_layer"), &CanvasLayer::get_layer);
 
+	ClassDB::bind_method(D_METHOD("set_visible", "visible"), &CanvasLayer::set_visible);
+	ClassDB::bind_method(D_METHOD("is_visible"), &CanvasLayer::is_visible);
+
 	ClassDB::bind_method(D_METHOD("set_transform", "transform"), &CanvasLayer::set_transform);
 	ClassDB::bind_method(D_METHOD("get_transform"), &CanvasLayer::get_transform);
 
@@ -289,6 +319,7 @@ void CanvasLayer::_bind_methods() {
 
 	ADD_GROUP("Layer", "");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "layer", PROPERTY_HINT_RANGE, "-128,128,1"), "set_layer", "get_layer");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "visible"), "set_visible", "is_visible");
 	ADD_GROUP("Transform", "");
 	ADD_PROPERTY(PropertyInfo(Variant::VECTOR2, "offset"), "set_offset", "get_offset");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "rotation", PROPERTY_HINT_RANGE, "-1080,1080,0.1,or_lesser,or_greater,radians"), "set_rotation", "get_rotation");
@@ -299,6 +330,8 @@ void CanvasLayer::_bind_methods() {
 	ADD_GROUP("Follow Viewport", "follow_viewport");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "follow_viewport_enable"), "set_follow_viewport", "is_following_viewport");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "follow_viewport_scale", PROPERTY_HINT_RANGE, "0.001,1000,0.001,or_greater,or_lesser"), "set_follow_viewport_scale", "get_follow_viewport_scale");
+
+	ADD_SIGNAL(MethodInfo("visibility_changed"));
 }
 
 CanvasLayer::CanvasLayer() {
diff --git a/scene/main/canvas_layer.h b/scene/main/canvas_layer.h
index 93a0152787..b7bd793440 100644
--- a/scene/main/canvas_layer.h
+++ b/scene/main/canvas_layer.h
@@ -52,6 +52,7 @@ class CanvasLayer : public Node {
 	Viewport *vp = nullptr;
 
 	int sort_index = 0;
+	bool visible = true;
 
 	bool follow_viewport = false;
 	float follow_viewport_scale = 1.0;
@@ -69,6 +70,9 @@ public:
 	void set_layer(int p_xform);
 	int get_layer() const;
 
+	void set_visible(bool p_visible);
+	bool is_visible() const;
+
 	void set_transform(const Transform2D &p_xform);
 	Transform2D get_transform() const;
 
diff --git a/scene/main/scene_tree.cpp b/scene/main/scene_tree.cpp
index 0e4a6a4b5c..69d781cbfc 100644
--- a/scene/main/scene_tree.cpp
+++ b/scene/main/scene_tree.cpp
@@ -1165,7 +1165,7 @@ void SceneTree::set_multiplayer(Ref<MultiplayerAPI> p_multiplayer) {
 	ERR_FAIL_COND(!p_multiplayer.is_valid());
 
 	multiplayer = p_multiplayer;
-	multiplayer->set_root_node(root);
+	multiplayer->set_root_path("/" + root->get_name());
 }
 
 void SceneTree::_bind_methods() {
diff --git a/scene/multiplayer/scene_cache_interface.cpp b/scene/multiplayer/scene_cache_interface.cpp
new file mode 100644
index 0000000000..de4a94470a
--- /dev/null
+++ b/scene/multiplayer/scene_cache_interface.cpp
@@ -0,0 +1,249 @@
+/*************************************************************************/
+/*  scene_cache_interface.cpp                                            */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "scene_cache_interface.h"
+
+#include "core/io/marshalls.h"
+#include "scene/main/node.h"
+#include "scene/main/window.h"
+
+MultiplayerCacheInterface *SceneCacheInterface::_create(MultiplayerAPI *p_multiplayer) { // Factory: allocates a SceneCacheInterface bound to p_multiplayer.
+	return memnew(SceneCacheInterface(p_multiplayer));
+}
+
+void SceneCacheInterface::make_default() { // Installs _create as MultiplayerAPI's default cache-interface factory.
+	MultiplayerAPI::create_default_cache_interface = _create;
+}
+
+void SceneCacheInterface::on_peer_change(int p_id, bool p_connected) { // Creates the receive cache for a connecting peer, or removes the cache state of a disconnecting one.
+	if (p_connected) {
+		path_get_cache.insert(p_id, PathGetCache());
+	} else {
+		// Cleanup get cache.
+		path_get_cache.erase(p_id);
+		// Cleanup sent cache.
+		// Some refactoring is needed to make this faster and do paths GC.
+		List<NodePath> keys;
+		path_send_cache.get_key_list(&keys);
+		for (const NodePath &E : keys) {
+			PathSentCache *psc = path_send_cache.getptr(E);
+			psc->confirmed_peers.erase(p_id); // The path entry itself is kept; only this peer's confirmation state is removed.
+		}
+	}
+}
+
+void SceneCacheInterface::process_simplify_path(int p_from, const uint8_t *p_packet, int p_packet_len) { // Handles a peer's request to cache a node path under a numeric ID, then sends back a confirmation.
+	Node *root_node = SceneTree::get_singleton()->get_root()->get_node(multiplayer->get_root_path()); // Received paths are resolved from this node.
+	ERR_FAIL_COND(!root_node);
+	ERR_FAIL_COND_MSG(p_packet_len < 38, "Invalid packet received. Size too small."); // 1 (command) + 33 (MD5 + terminator) + 4 (ID) bytes precede the path.
+	int ofs = 1; // Skip the command byte.
+
+	String methods_md5;
+	methods_md5.parse_utf8((const char *)(p_packet + ofs), 32);
+	ofs += 33; // 32 MD5 characters plus the terminating zero.
+
+	int id = decode_uint32(&p_packet[ofs]); // Cache ID the sender assigned to this path.
+	ofs += 4;
+
+	String paths;
+	paths.parse_utf8((const char *)(p_packet + ofs), p_packet_len - ofs); // The rest of the packet is the node path.
+
+	NodePath path = paths;
+
+	if (!path_get_cache.has(p_from)) {
+		path_get_cache[p_from] = PathGetCache();
+	}
+
+	Node *node = root_node->get_node(path);
+	ERR_FAIL_COND(node == nullptr);
+	const bool valid_rpc_checksum = multiplayer->get_rpc_md5(node) == methods_md5;
+	if (valid_rpc_checksum == false) { // Reported only: the path is still cached and acknowledged below.
+		ERR_PRINT("The rpc node checksum failed. Make sure to have the same methods on both nodes. Node path: " + path);
+	}
+
+	PathGetCache::NodeInfo ni;
+	ni.path = path;
+
+	path_get_cache[p_from].nodes[id] = ni;
+
+	// Encode path to send ack.
+	CharString pname = String(path).utf8();
+	int len = encode_cstring(pname.get_data(), nullptr); // Called with a null buffer; the returned length sizes the packet below.
+
+	Vector<uint8_t> packet;
+
+	packet.resize(1 + 1 + len); // Command byte + checksum flag + encoded path.
+	packet.write[0] = MultiplayerAPI::NETWORK_COMMAND_CONFIRM_PATH;
+	packet.write[1] = valid_rpc_checksum;
+	encode_cstring(pname.get_data(), &packet.write[2]);
+
+	Ref<MultiplayerPeer> multiplayer_peer = multiplayer->get_multiplayer_peer();
+	ERR_FAIL_COND(multiplayer_peer.is_null());
+
+	multiplayer_peer->set_transfer_channel(0);
+	multiplayer_peer->set_transfer_mode(Multiplayer::TRANSFER_MODE_RELIABLE);
+	multiplayer_peer->set_target_peer(p_from); // Reply only to the peer that sent the request.
+	multiplayer_peer->put_packet(packet.ptr(), packet.size());
+}
+
+void SceneCacheInterface::process_confirm_path(int p_from, const uint8_t *p_packet, int p_packet_len) { // Handles a peer's confirmation that it cached a path this side announced.
+	ERR_FAIL_COND_MSG(p_packet_len < 3, "Invalid packet received. Size too small."); // Command byte + checksum flag + at least one path byte.
+
+	const bool valid_rpc_checksum = p_packet[1]; // Flag written by the remote process_simplify_path().
+
+	String paths;
+	paths.parse_utf8((const char *)&p_packet[2], p_packet_len - 2);
+
+	NodePath path = paths;
+
+	if (valid_rpc_checksum == false) { // Reported only: the confirmation is still recorded below.
+		ERR_PRINT("The rpc node checksum failed. Make sure to have the same methods on both nodes. Node path: " + path);
+	}
+
+	PathSentCache *psc = path_send_cache.getptr(path);
+	ERR_FAIL_COND_MSG(!psc, "Invalid packet received. Tries to confirm a path which was not found in cache.");
+
+	Map<int, bool>::Element *E = psc->confirmed_peers.find(p_from);
+	ERR_FAIL_COND_MSG(!E, "Invalid packet received. Source peer was not found in cache for the given path.");
+	E->get() = true; // Mark the path as confirmed for this peer.
+}
+
+bool SceneCacheInterface::_send_confirm_path(Node *p_node, NodePath p_path, PathSentCache *psc, int p_target) { // Announces the path/ID pair to targeted peers that have no entry yet; returns true only if every targeted peer already confirmed it.
+	bool has_all_peers = true;
+	List<int> peers_to_add; // If one is missing, take note to add it.
+
+	for (const Set<int>::Element *E = multiplayer->get_connected_peers().front(); E; E = E->next()) {
+		if (p_target < 0 && E->get() == -p_target) { // Negative target: everyone except peer -p_target.
+			continue; // Continue, excluded.
+		}
+
+		if (p_target > 0 && E->get() != p_target) { // Positive target: only that peer. Zero skips neither check, so all peers are considered.
+			continue; // Continue, not for this peer.
+		}
+
+		Map<int, bool>::Element *F = psc->confirmed_peers.find(E->get());
+
+		if (!F || !F->get()) {
+			// Path was not cached, or was cached but is unconfirmed.
+			if (!F) {
+				// Not cached at all, take note.
+				peers_to_add.push_back(E->get());
+			}
+
+			has_all_peers = false;
+		}
+	}
+
+	if (peers_to_add.size() > 0) {
+		// Those that need to be added, send a message for this.
+
+		// Encode the node path.
+		const CharString path = String(p_path).utf8();
+		const int path_len = encode_cstring(path.get_data(), nullptr);
+
+		// Extract MD5 from rpc methods list.
+		const String methods_md5 = multiplayer->get_rpc_md5(p_node);
+		const int methods_md5_len = 33; // 32 + 1 for the `0` that is added by the encoder.
+
+		Vector<uint8_t> packet;
+		packet.resize(1 + 4 + path_len + methods_md5_len); // Command byte + 32-bit cache ID + path + MD5 (written below in the order command, MD5, ID, path).
+		int ofs = 0;
+
+		packet.write[ofs] = MultiplayerAPI::NETWORK_COMMAND_SIMPLIFY_PATH;
+		ofs += 1;
+
+		ofs += encode_cstring(methods_md5.utf8().get_data(), &packet.write[ofs]);
+
+		ofs += encode_uint32(psc->id, &packet.write[ofs]);
+
+		ofs += encode_cstring(path.get_data(), &packet.write[ofs]);
+
+		Ref<MultiplayerPeer> multiplayer_peer = multiplayer->get_multiplayer_peer();
+		ERR_FAIL_COND_V(multiplayer_peer.is_null(), false);
+
+		for (int &E : peers_to_add) {
+			multiplayer_peer->set_target_peer(E); // Address the packet to this one peer.
+			multiplayer_peer->set_transfer_channel(0);
+			multiplayer_peer->set_transfer_mode(Multiplayer::TRANSFER_MODE_RELIABLE);
+			multiplayer_peer->put_packet(packet.ptr(), packet.size());
+
+			psc->confirmed_peers.insert(E, false); // Insert into confirmed, but as false since it was not confirmed.
+		}
+	}
+
+	return has_all_peers;
+}
+
+bool SceneCacheInterface::is_cache_confirmed(NodePath p_path, int p_peer) { // Returns whether p_peer has confirmed p_path; fails with false if the path or the peer is not in the send cache.
+	const PathSentCache *psc = path_send_cache.getptr(p_path);
+	ERR_FAIL_COND_V(!psc, false);
+	const Map<int, bool>::Element *F = psc->confirmed_peers.find(p_peer);
+	ERR_FAIL_COND_V(!F, false); // Should never happen.
+	return F->get();
+}
+
+bool SceneCacheInterface::send_object_cache(Object *p_obj, NodePath p_path, int p_peer_id, int &r_id) { // Ensures p_path has a send-cache ID (returned in r_id) and announces it via _send_confirm_path(); p_obj must be a Node.
+	Node *node = Object::cast_to<Node>(p_obj);
+	ERR_FAIL_COND_V(!node, false);
+	// See if the path is cached.
+	PathSentCache *psc = path_send_cache.getptr(p_path);
+	if (!psc) {
+		// Path is not cached, create.
+		path_send_cache[p_path] = PathSentCache();
+		psc = path_send_cache.getptr(p_path);
+		psc->id = last_send_cache_id++; // IDs are handed out sequentially.
+	}
+	r_id = psc->id;
+
+	return _send_confirm_path(node, p_path, psc, p_peer_id); // True only if every targeted peer already confirmed the path.
+}
+
+Object *SceneCacheInterface::get_cached_object(int p_from, uint32_t p_cache_id) { // Resolves a cache ID previously announced by peer p_from to a node; returns nullptr on any failure.
+	Node *root_node = SceneTree::get_singleton()->get_root()->get_node(multiplayer->get_root_path());
+	ERR_FAIL_COND_V(!root_node, nullptr);
+	Map<int, PathGetCache>::Element *E = path_get_cache.find(p_from);
+	ERR_FAIL_COND_V_MSG(!E, nullptr, vformat("No cache found for peer %d.", p_from));
+
+	Map<int, PathGetCache::NodeInfo>::Element *F = E->get().nodes.find(p_cache_id);
+	ERR_FAIL_COND_V_MSG(!F, nullptr, vformat("ID %d not found in cache of peer %d.", p_cache_id, p_from));
+
+	PathGetCache::NodeInfo *ni = &F->get();
+	Node *node = root_node->get_node(ni->path); // The node is looked up by its stored path on every call.
+	if (!node) {
+		ERR_PRINT("Failed to get cached path: " + String(ni->path) + ".");
+	}
+	return node;
+}
+
+void SceneCacheInterface::clear() { // Drops all send/receive cache state and restarts ID allocation at 1.
+	path_get_cache.clear();
+	path_send_cache.clear();
+	last_send_cache_id = 1;
+}
diff --git a/scene/multiplayer/scene_cache_interface.h b/scene/multiplayer/scene_cache_interface.h
new file mode 100644
index 0000000000..91a53cb948
--- /dev/null
+++ b/scene/multiplayer/scene_cache_interface.h
@@ -0,0 +1,82 @@
+/*************************************************************************/
+/*  scene_cache_interface.h                                              */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef SCENE_CACHE_INTERFACE_H
+#define SCENE_CACHE_INTERFACE_H
+
+#include "core/multiplayer/multiplayer_api.h"
+
+class SceneCacheInterface : public MultiplayerCacheInterface {
+	GDCLASS(SceneCacheInterface, MultiplayerCacheInterface);
+
+private:
+	MultiplayerAPI *multiplayer; // Assigned by the constructor.
+
+	// Per-path send state: the ID assigned to the path and, per peer, whether that peer confirmed it.
+	struct PathSentCache {
+		Map<int, bool> confirmed_peers; // Peer ID -> true once that peer confirmed the path.
+		int id; // Assigned from last_send_cache_id when the entry is created.
+	};
+
+	// Per-peer receive state: paths announced by that peer, keyed by the ID it chose.
+	struct PathGetCache {
+		struct NodeInfo {
+			NodePath path;
+			ObjectID instance; // NOTE(review): not assigned in scene_cache_interface.cpp; confirm whether it is still needed.
+		};
+
+		Map<int, NodeInfo> nodes;
+	};
+
+	HashMap<NodePath, PathSentCache> path_send_cache;
+	Map<int, PathGetCache> path_get_cache; // Keyed by peer ID.
+	int last_send_cache_id = 1; // Next ID handed out by send_object_cache().
+
+protected:
+	bool _send_confirm_path(Node *p_node, NodePath p_path, PathSentCache *psc, int p_target);
+	static MultiplayerCacheInterface *_create(MultiplayerAPI *p_multiplayer);
+
+public:
+	static void make_default();
+
+	virtual void clear() override;
+	virtual void on_peer_change(int p_id, bool p_connected) override;
+	virtual void process_simplify_path(int p_from, const uint8_t *p_packet, int p_packet_len) override;
+	virtual void process_confirm_path(int p_from, const uint8_t *p_packet, int p_packet_len) override;
+
+	// Returns true if all targeted peers have confirmed the cached path.
+	virtual bool send_object_cache(Object *p_obj, NodePath p_path, int p_target, int &p_id) override; // NOTE(review): the definition names the last two parameters p_peer_id and r_id.
+	virtual Object *get_cached_object(int p_from, uint32_t p_cache_id) override;
+	virtual bool is_cache_confirmed(NodePath p_path, int p_peer) override;
+
+	SceneCacheInterface(MultiplayerAPI *p_multiplayer) { multiplayer = p_multiplayer; }
+};
+
+#endif // SCENE_CACHE_INTERFACE_H
diff --git a/scene/multiplayer/scene_replication_interface.cpp b/scene/multiplayer/scene_replication_interface.cpp
index 7155935084..2088a43ba7 100644
--- a/scene/multiplayer/scene_replication_interface.cpp
+++ b/scene/multiplayer/scene_replication_interface.cpp
@@ -186,12 +186,10 @@ Error SceneReplicationInterface::_send_spawn(Node *p_node, MultiplayerSpawner *p
 	}
 
 	// Prepare simplified path.
-	const Node *root_node = multiplayer->get_root_node();
-	ERR_FAIL_COND_V(!root_node, ERR_UNCONFIGURED);
-	NodePath rel_path = (root_node->get_path()).rel_path_to(p_spawner->get_path());
+	NodePath rel_path = multiplayer->get_root_path().rel_path_to(p_spawner->get_path());
 
 	int path_id = 0;
-	multiplayer->send_confirm_path(p_spawner, rel_path, p_peer, path_id);
+	multiplayer->send_object_cache(p_spawner, rel_path, p_peer, path_id);
 
 	// Encode name and parent ID.
 	CharString cname = p_node->get_name().operator String().utf8();
@@ -243,7 +241,7 @@ Error SceneReplicationInterface::on_spawn_receive(int p_from, const uint8_t *p_b
 	ofs += 1;
 	uint32_t node_target = decode_uint32(&p_buffer[ofs]);
 	ofs += 4;
-	MultiplayerSpawner *spawner = Object::cast_to<MultiplayerSpawner>(multiplayer->get_cached_node(p_from, node_target));
+	MultiplayerSpawner *spawner = Object::cast_to<MultiplayerSpawner>(multiplayer->get_cached_object(p_from, node_target));
 	ERR_FAIL_COND_V(!spawner, ERR_DOES_NOT_EXIST);
 	ERR_FAIL_COND_V(p_from != spawner->get_multiplayer_authority(), ERR_UNAUTHORIZED);
 
@@ -349,11 +347,9 @@ void SceneReplicationInterface::_send_sync(int p_peer, uint64_t p_msec) {
 			uint32_t net_id = rep_state->get_net_id(oid);
 			if (net_id == 0) {
 				// First time path based ID.
-				const Node *root_node = multiplayer->get_root_node();
-				ERR_FAIL_COND(!root_node);
-				NodePath rel_path = (root_node->get_path()).rel_path_to(sync->get_path());
+				NodePath rel_path = multiplayer->get_root_path().rel_path_to(sync->get_path());
 				int path_id = 0;
-				multiplayer->send_confirm_path(sync, rel_path, p_peer, path_id);
+				multiplayer->send_object_cache(sync, rel_path, p_peer, path_id);
 				net_id = path_id;
 				rep_state->set_net_id(oid, net_id | 0x80000000);
 			}
@@ -381,7 +377,7 @@ Error SceneReplicationInterface::on_sync_receive(int p_from, const uint8_t *p_bu
 		ofs += 4;
 		Node *node = nullptr;
 		if (net_id & 0x80000000) {
-			MultiplayerSynchronizer *sync = Object::cast_to<MultiplayerSynchronizer>(multiplayer->get_cached_node(p_from, net_id & 0x7FFFFFFF));
+			MultiplayerSynchronizer *sync = Object::cast_to<MultiplayerSynchronizer>(multiplayer->get_cached_object(p_from, net_id & 0x7FFFFFFF));
 			ERR_FAIL_COND_V(!sync || sync->get_multiplayer_authority() != p_from, ERR_UNAUTHORIZED);
 			node = sync->get_node(sync->get_root_path());
 		} else {
diff --git a/core/multiplayer/rpc_manager.cpp b/scene/multiplayer/scene_rpc_interface.cpp
index 1e6d2108be..7d7f57b9a1 100644
--- a/core/multiplayer/rpc_manager.cpp
+++ b/scene/multiplayer/scene_rpc_interface.cpp
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  rpc_manager.cpp                                                      */
+/*  scene_rpc_interface.cpp                                              */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,15 +28,24 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#include "core/multiplayer/rpc_manager.h"
+#include "scene/multiplayer/scene_rpc_interface.h"
 
 #include "core/debugger/engine_debugger.h"
 #include "core/io/marshalls.h"
 #include "core/multiplayer/multiplayer_api.h"
 #include "scene/main/node.h"
+#include "scene/main/window.h"
+
+MultiplayerRPCInterface *SceneRPCInterface::_create(MultiplayerAPI *p_multiplayer) { // Factory: allocates a SceneRPCInterface bound to p_multiplayer.
+	return memnew(SceneRPCInterface(p_multiplayer));
+}
+
+void SceneRPCInterface::make_default() { // Installs _create as MultiplayerAPI's default RPC-interface factory.
+	MultiplayerAPI::create_default_rpc_interface = _create;
+}
 
 #ifdef DEBUG_ENABLED
-_FORCE_INLINE_ void RPCManager::_profile_node_data(const String &p_what, ObjectID p_id) {
+_FORCE_INLINE_ void SceneRPCInterface::_profile_node_data(const String &p_what, ObjectID p_id) {
 	if (EngineDebugger::is_profiling("multiplayer")) {
 		Array values;
 		values.push_back("node");
@@ -46,7 +55,7 @@ _FORCE_INLINE_ void RPCManager::_profile_node_data(const String &p_what, ObjectI
 	}
 }
 #else
-_FORCE_INLINE_ void RPCManager::_profile_node_data(const String &p_what, ObjectID p_id) {}
+_FORCE_INLINE_ void SceneRPCInterface::_profile_node_data(const String &p_what, ObjectID p_id) {}
 #endif
 
 // Returns the packet size stripping the node path added when the node is not yet cached.
@@ -110,14 +119,16 @@ _FORCE_INLINE_ bool _can_call_mode(Node *p_node, Multiplayer::RPCMode mode, int
 	return false;
 }
 
-String RPCManager::get_rpc_md5(const Node *p_node) {
+String SceneRPCInterface::get_rpc_md5(const Object *p_obj) const {
+	const Node *node = Object::cast_to<Node>(p_obj);
+	ERR_FAIL_COND_V(!node, "");
 	String rpc_list;
-	const Vector<Multiplayer::RPCConfig> node_config = p_node->get_node_rpc_methods();
+	const Vector<Multiplayer::RPCConfig> node_config = node->get_node_rpc_methods();
 	for (int i = 0; i < node_config.size(); i++) {
 		rpc_list += String(node_config[i].name);
 	}
-	if (p_node->get_script_instance()) {
-		const Vector<Multiplayer::RPCConfig> script_config = p_node->get_script_instance()->get_rpc_methods();
+	if (node->get_script_instance()) {
+		const Vector<Multiplayer::RPCConfig> script_config = node->get_script_instance()->get_rpc_methods();
 		for (int i = 0; i < script_config.size(); i++) {
 			rpc_list += String(script_config[i].name);
 		}
@@ -125,7 +136,9 @@ String RPCManager::get_rpc_md5(const Node *p_node) {
 	return rpc_list.md5_text();
 }
 
-Node *RPCManager::_process_get_node(int p_from, const uint8_t *p_packet, uint32_t p_node_target, int p_packet_len) {
+Node *SceneRPCInterface::_process_get_node(int p_from, const uint8_t *p_packet, uint32_t p_node_target, int p_packet_len) {
+	Node *root_node = SceneTree::get_singleton()->get_root()->get_node(multiplayer->get_root_path());
+	ERR_FAIL_COND_V(!root_node, nullptr);
 	Node *node = nullptr;
 
 	if (p_node_target & 0x80000000) {
@@ -139,7 +152,7 @@ Node *RPCManager::_process_get_node(int p_from, const uint8_t *p_packet, uint32_
 
 		NodePath np = paths;
 
-		node = multiplayer->get_root_node()->get_node(np);
+		node = root_node->get_node(np);
 
 		if (!node) {
 			ERR_PRINT("Failed to get path from RPC: " + String(np) + ".");
@@ -147,11 +160,11 @@ Node *RPCManager::_process_get_node(int p_from, const uint8_t *p_packet, uint32_
 		return node;
 	} else {
 		// Use cached path.
-		return multiplayer->get_cached_node(p_from, p_node_target);
+		return Object::cast_to<Node>(multiplayer->get_cached_object(p_from, p_node_target));
 	}
 }
 
-void RPCManager::process_rpc(int p_from, const uint8_t *p_packet, int p_packet_len) {
+void SceneRPCInterface::process_rpc(int p_from, const uint8_t *p_packet, int p_packet_len) {
 	// Extract packet meta
 	int packet_min_size = 1;
 	int name_id_offset = 1;
@@ -224,7 +237,7 @@ void RPCManager::process_rpc(int p_from, const uint8_t *p_packet, int p_packet_l
 	_process_rpc(node, name_id, p_from, p_packet, packet_len, packet_min_size);
 }
 
-void RPCManager::_process_rpc(Node *p_node, const uint16_t p_rpc_method_id, int p_from, const uint8_t *p_packet, int p_packet_len, int p_offset) {
+void SceneRPCInterface::_process_rpc(Node *p_node, const uint16_t p_rpc_method_id, int p_from, const uint8_t *p_packet, int p_packet_len, int p_offset) {
 	ERR_FAIL_COND_MSG(p_offset > p_packet_len, "Invalid packet received. Size too small.");
 
 	// Check that remote can call the RPC on this node.
@@ -274,7 +287,7 @@ void RPCManager::_process_rpc(Node *p_node, const uint16_t p_rpc_method_id, int
 	}
 }
 
-void RPCManager::_send_rpc(Node *p_from, int p_to, uint16_t p_rpc_id, const Multiplayer::RPCConfig &p_config, const StringName &p_name, const Variant **p_arg, int p_argcount) {
+void SceneRPCInterface::_send_rpc(Node *p_from, int p_to, uint16_t p_rpc_id, const Multiplayer::RPCConfig &p_config, const StringName &p_name, const Variant **p_arg, int p_argcount) {
 	Ref<MultiplayerPeer> peer = multiplayer->get_multiplayer_peer();
 	ERR_FAIL_COND_MSG(peer.is_null(), "Attempt to call RPC without active multiplayer peer.");
 
@@ -290,12 +303,12 @@ void RPCManager::_send_rpc(Node *p_from, int p_to, uint16_t p_rpc_id, const Mult
 		ERR_FAIL_MSG("Attempt to call RPC with unknown peer ID: " + itos(p_to) + ".");
 	}
 
-	NodePath from_path = (multiplayer->get_root_node()->get_path()).rel_path_to(p_from->get_path());
+	NodePath from_path = multiplayer->get_root_path().rel_path_to(p_from->get_path());
 	ERR_FAIL_COND_MSG(from_path.is_empty(), "Unable to send RPC. Relative path is empty. THIS IS LIKELY A BUG IN THE ENGINE!");
 
 	// See if all peers have cached path (if so, call can be fast).
 	int psc_id;
-	const bool has_all_peers = multiplayer->send_confirm_path(p_from, from_path, p_to, psc_id);
+	const bool has_all_peers = multiplayer->send_object_cache(p_from, from_path, p_to, psc_id);
 
 	// Create base packet, lots of hardcode because it must be tight.
 
@@ -433,19 +446,21 @@ void RPCManager::_send_rpc(Node *p_from, int p_to, uint16_t p_rpc_id, const Mult
 	}
 }
 
-void RPCManager::rpcp(Node *p_node, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount) {
+void SceneRPCInterface::rpcp(Object *p_obj, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount) {
 	Ref<MultiplayerPeer> peer = multiplayer->get_multiplayer_peer();
 	ERR_FAIL_COND_MSG(!peer.is_valid(), "Trying to call an RPC while no multiplayer peer is active.");
-	ERR_FAIL_COND_MSG(!p_node->is_inside_tree(), "Trying to call an RPC on a node which is not inside SceneTree.");
+	Node *node = Object::cast_to<Node>(p_obj);
+	ERR_FAIL_COND(!node);
+	ERR_FAIL_COND_MSG(!node->is_inside_tree(), "Trying to call an RPC on a node which is not inside SceneTree.");
 	ERR_FAIL_COND_MSG(peer->get_connection_status() != MultiplayerPeer::CONNECTION_CONNECTED, "Trying to call an RPC via a multiplayer peer which is not connected.");
 
 	int node_id = peer->get_unique_id();
 	bool call_local_native = false;
 	bool call_local_script = false;
 	uint16_t rpc_id = UINT16_MAX;
-	const Multiplayer::RPCConfig config = _get_rpc_config(p_node, p_method, rpc_id);
+	const Multiplayer::RPCConfig config = _get_rpc_config(node, p_method, rpc_id);
 	ERR_FAIL_COND_MSG(config.name == StringName(),
-			vformat("Unable to get the RPC configuration for the function \"%s\" at path: \"%s\". This happens when the method is not marked for RPCs.", p_method, p_node->get_path()));
+			vformat("Unable to get the RPC configuration for the function \"%s\" at path: \"%s\". This happens when the method is not marked for RPCs.", p_method, node->get_path()));
 	if (p_peer_id == 0 || p_peer_id == node_id || (p_peer_id < 0 && p_peer_id != -node_id)) {
 		if (rpc_id & (1 << 15)) {
 			call_local_native = config.call_local;
@@ -456,21 +471,21 @@ void RPCManager::rpcp(Node *p_node, int p_peer_id, const StringName &p_method, c
 
 	if (p_peer_id != node_id) {
 #ifdef DEBUG_ENABLED
-		_profile_node_data("out_rpc", p_node->get_instance_id());
+		_profile_node_data("out_rpc", node->get_instance_id());
 #endif
 
-		_send_rpc(p_node, p_peer_id, rpc_id, config, p_method, p_arg, p_argcount);
+		_send_rpc(node, p_peer_id, rpc_id, config, p_method, p_arg, p_argcount);
 	}
 
 	if (call_local_native) {
 		Callable::CallError ce;
 
 		multiplayer->set_remote_sender_override(peer->get_unique_id());
-		p_node->call(p_method, p_arg, p_argcount, ce);
+		node->call(p_method, p_arg, p_argcount, ce);
 		multiplayer->set_remote_sender_override(0);
 
 		if (ce.error != Callable::CallError::CALL_OK) {
-			String error = Variant::get_call_error_text(p_node, p_method, p_arg, p_argcount, ce);
+			String error = Variant::get_call_error_text(node, p_method, p_arg, p_argcount, ce);
 			error = "rpc() aborted in local call:  - " + error + ".";
 			ERR_PRINT(error);
 			return;
@@ -482,11 +497,11 @@ void RPCManager::rpcp(Node *p_node, int p_peer_id, const StringName &p_method, c
 		ce.error = Callable::CallError::CALL_OK;
 
 		multiplayer->set_remote_sender_override(peer->get_unique_id());
-		p_node->get_script_instance()->call(p_method, p_arg, p_argcount, ce);
+		node->get_script_instance()->call(p_method, p_arg, p_argcount, ce);
 		multiplayer->set_remote_sender_override(0);
 
 		if (ce.error != Callable::CallError::CALL_OK) {
-			String error = Variant::get_call_error_text(p_node, p_method, p_arg, p_argcount, ce);
+			String error = Variant::get_call_error_text(node, p_method, p_arg, p_argcount, ce);
 			error = "rpc() aborted in script local call:  - " + error + ".";
 			ERR_PRINT(error);
 			return;
diff --git a/core/multiplayer/rpc_manager.h b/scene/multiplayer/scene_rpc_interface.h
index 00bd1f9cb0..86e1d0d280 100644
--- a/core/multiplayer/rpc_manager.h
+++ b/scene/multiplayer/scene_rpc_interface.h
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  rpc_manager.h                                                        */
+/*  scene_rpc_interface.h                                                */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,15 +28,14 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef MULTIPLAYER_RPC_H
-#define MULTIPLAYER_RPC_H
+#ifndef SCENE_RPC_INTERFACE_H
+#define SCENE_RPC_INTERFACE_H
 
 #include "core/multiplayer/multiplayer.h"
 #include "core/multiplayer/multiplayer_api.h"
-#include "core/object/ref_counted.h"
 
-class RPCManager : public RefCounted {
-	GDCLASS(RPCManager, RefCounted);
+class SceneRPCInterface : public MultiplayerRPCInterface {
+	GDCLASS(SceneRPCInterface, MultiplayerRPCInterface);
 
 private:
 	enum NetworkNodeIdCompression {
@@ -71,6 +70,8 @@ private:
 	Vector<uint8_t> packet_cache;
 
 protected:
+	static MultiplayerRPCInterface *_create(MultiplayerAPI *p_multiplayer);
+
 	_FORCE_INLINE_ void _profile_node_data(const String &p_what, ObjectID p_id);
 	void _process_rpc(Node *p_node, const uint16_t p_rpc_method_id, int p_from, const uint8_t *p_packet, int p_packet_len, int p_offset);
 
@@ -78,12 +79,13 @@ protected:
 	Node *_process_get_node(int p_from, const uint8_t *p_packet, uint32_t p_node_target, int p_packet_len);
 
 public:
-	// Called by Node.rpc
-	void rpcp(Node *p_node, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount);
-	void process_rpc(int p_from, const uint8_t *p_packet, int p_packet_len);
+	static void make_default();
+
+	virtual void rpcp(Object *p_obj, int p_peer_id, const StringName &p_method, const Variant **p_arg, int p_argcount) override;
+	virtual void process_rpc(int p_from, const uint8_t *p_packet, int p_packet_len) override;
+	virtual String get_rpc_md5(const Object *p_obj) const override;
 
-	String get_rpc_md5(const Node *p_node);
-	RPCManager(MultiplayerAPI *p_multiplayer) { multiplayer = p_multiplayer; }
+	SceneRPCInterface(MultiplayerAPI *p_multiplayer) { multiplayer = p_multiplayer; }
 };
 
-#endif // MULTIPLAYER_RPC_H
+#endif // SCENE_RPC_INTERFACE_H
diff --git a/scene/register_scene_types.cpp b/scene/register_scene_types.cpp
index f19b018982..9ed83eb8c3 100644
--- a/scene/register_scene_types.cpp
+++ b/scene/register_scene_types.cpp
@@ -136,7 +136,9 @@
 #include "scene/main/window.h"
 #include "scene/multiplayer/multiplayer_spawner.h"
 #include "scene/multiplayer/multiplayer_synchronizer.h"
+#include "scene/multiplayer/scene_cache_interface.h"
 #include "scene/multiplayer/scene_replication_interface.h"
+#include "scene/multiplayer/scene_rpc_interface.h"
 #include "scene/resources/audio_stream_sample.h"
 #include "scene/resources/bit_map.h"
 #include "scene/resources/box_shape_3d.h"
@@ -1058,6 +1060,8 @@ void register_scene_types() {
 
 	SceneDebugger::initialize();
 	SceneReplicationInterface::make_default();
+	SceneRPCInterface::make_default();
+	SceneCacheInterface::make_default();
 
 	NativeExtensionManager::get_singleton()->initialize_extensions(NativeExtension::INITIALIZATION_LEVEL_SCENE);
 }
diff --git a/servers/rendering/renderer_rd/effects_rd.cpp b/servers/rendering/renderer_rd/effects_rd.cpp
index 25a366aa4b..7183fd110f 100644
--- a/servers/rendering/renderer_rd/effects_rd.cpp
+++ b/servers/rendering/renderer_rd/effects_rd.cpp
@@ -1453,7 +1453,6 @@ void EffectsRD::downsample_depth(RID p_depth_buffer, const Vector<RID> &p_depth_
 
 	RD::get_singleton()->compute_list_end(RD::BARRIER_MASK_COMPUTE);
 
-	ss_effects.used_full_mips_last_frame = use_mips;
 	ss_effects.used_full_mips_last_frame = use_full_mips;
 	ss_effects.used_half_size_last_frame = use_half_size;
 }
diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp
index 8ded180633..5b2be8e174 100644
--- a/servers/rendering/renderer_scene_cull.cpp
+++ b/servers/rendering/renderer_scene_cull.cpp
@@ -3871,8 +3871,12 @@ void RendererSceneCull::update_dirty_instances() {
 
 void RendererSceneCull::update() {
 	//optimize bvhs
-	for (uint32_t i = 0; i < scenario_owner.get_rid_count(); i++) {
-		Scenario *s = scenario_owner.get_ptr_by_index(i);
+
+	uint32_t rid_count = scenario_owner.get_rid_count();
+	RID *rids = (RID *)alloca(sizeof(RID) * rid_count);
+	scenario_owner.fill_owned_buffer(rids);
+	for (uint32_t i = 0; i < rid_count; i++) {
+		Scenario *s = scenario_owner.get_or_null(rids[i]);
 		s->indexers[Scenario::INDEXER_GEOMETRY].optimize_incremental(indexer_update_iterations);
 		s->indexers[Scenario::INDEXER_VOLUMES].optimize_incremental(indexer_update_iterations);
 	}
diff --git a/servers/rendering/shader_language.cpp b/servers/rendering/shader_language.cpp
index a433c666f3..ead196b7dd 100644
--- a/servers/rendering/shader_language.cpp
+++ b/servers/rendering/shader_language.cpp
@@ -5255,8 +5255,10 @@ ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, cons
 					return nullptr;
 				} else {
 #ifdef DEBUG_ENABLED
-					if (check_warnings && HAS_WARNING(ShaderWarning::FORMATTING_ERROR_FLAG)) {
-						_add_line_warning(ShaderWarning::FORMATTING_ERROR, RTR("Empty statement. Remove ';' to fix this warning."));
+					if (!p_block || (p_block->block_type != BlockNode::BLOCK_TYPE_FOR_INIT && p_block->block_type != BlockNode::BLOCK_TYPE_FOR_CONDITION)) {
+						if (check_warnings && HAS_WARNING(ShaderWarning::FORMATTING_ERROR_FLAG)) {
+							_add_line_warning(ShaderWarning::FORMATTING_ERROR, RTR("Empty statement. Remove ';' to fix this warning."));
+						}
 					}
 #endif // DEBUG_ENABLED
 					_set_tkpos(prepos);
@@ -6370,6 +6372,8 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 		}
 
 		bool is_struct = shader->structs.has(tk.text);
+		bool is_var_init = false;
+		bool is_condition = false;
 
 		if (tk.type == TK_CURLY_BRACKET_CLOSE) { //end of block
 			if (p_just_one) {
@@ -6380,6 +6384,8 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 			return OK;
 
 		} else if (tk.type == TK_CONST || is_token_precision(tk.type) || is_token_nonvoid_datatype(tk.type) || is_struct) {
+			is_var_init = true;
+
 			String struct_name = "";
 			if (is_struct) {
 				struct_name = tk.text;
@@ -6489,9 +6495,17 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 				decl.name = name;
 
 #ifdef DEBUG_ENABLED
-				if (check_warnings && HAS_WARNING(ShaderWarning::UNUSED_LOCAL_VARIABLE_FLAG)) {
-					if (p_block && p_block->parent_function) {
-						StringName func_name = p_block->parent_function->name;
+				if (check_warnings && HAS_WARNING(ShaderWarning::UNUSED_LOCAL_VARIABLE_FLAG) && p_block) {
+					// Nearest enclosing function; stays null if the parent chain has none.
+					FunctionNode *parent_function = nullptr;
+					for (BlockNode *block = p_block; block; block = block->parent_block) {
+						if (block->parent_function) {
+							parent_function = block->parent_function;
+							break;
+						}
+					}
+					if (parent_function) {
+						StringName func_name = parent_function->name;
 
 						if (!used_local_vars.has(func_name)) {
 							used_local_vars.insert(func_name, Map<StringName, Usage>());
@@ -7319,25 +7333,13 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 			if (!expr) {
 				return ERR_PARSE_ERROR;
 			}
-
-			bool empty = false;
+			is_condition = expr->type == Node::TYPE_OPERATOR && expr->get_datatype() == TYPE_BOOL;
 
 			if (expr->type == Node::TYPE_OPERATOR) {
 				OperatorNode *op = static_cast<OperatorNode *>(expr);
 				if (op->op == OP_EMPTY) {
-					empty = true;
-				}
-			}
-			if (p_block->block_type == BlockNode::BLOCK_TYPE_FOR_INIT) {
-				if (!empty && expr->type != BlockNode::TYPE_VARIABLE_DECLARATION) {
-					_set_error(RTR("The left expression is expected to be a variable declaration."));
-					return ERR_PARSE_ERROR;
-				}
-			}
-			if (p_block->block_type == BlockNode::BLOCK_TYPE_FOR_CONDITION) {
-				if (!empty && expr->get_datatype() != TYPE_BOOL) {
-					_set_error(RTR("The middle expression is expected to be boolean."));
-					return ERR_PARSE_ERROR;
+					is_var_init = true;
+					is_condition = true;
 				}
 			}
 
@@ -7346,6 +7348,10 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 
 			if (p_block->block_type == BlockNode::BLOCK_TYPE_FOR_CONDITION) {
 				if (tk.type == TK_COMMA) {
+					if (!is_condition) {
+						_set_error(RTR("The middle expression is expected to be a boolean operator."));
+						return ERR_PARSE_ERROR;
+					}
 					continue;
 				}
 				if (tk.type != TK_SEMICOLON) {
@@ -7366,6 +7372,17 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 			}
 		}
 
+		if (p_block) {
+			if (p_block->block_type == BlockNode::BLOCK_TYPE_FOR_INIT && !is_var_init) {
+				_set_error(RTR("The left expression is expected to be a variable declaration."));
+				return ERR_PARSE_ERROR;
+			}
+			if (p_block->block_type == BlockNode::BLOCK_TYPE_FOR_CONDITION && !is_condition) {
+				_set_error(RTR("The middle expression is expected to be a boolean operator."));
+				return ERR_PARSE_ERROR;
+			}
+		}
+
 		if (p_just_one) {
 			break;
 		}
diff --git a/tests/core/math/test_expression.h b/tests/core/math/test_expression.h
index 5a894b20f3..6e3be541b0 100644
--- a/tests/core/math/test_expression.h
+++ b/tests/core/math/test_expression.h
@@ -137,7 +137,7 @@ TEST_CASE("[Expression] Scientific notation") {
 			expression.parse("2e5") == OK,
 			"The expression should parse successfully.");
 	CHECK_MESSAGE(
-			Math::is_equal_approx(double(expression.execute()), 25),
+			Math::is_equal_approx(double(expression.execute()), 2e5),
 			"The expression should return the expected result.");
 
 	CHECK_MESSAGE(
diff --git a/thirdparty/README.md b/thirdparty/README.md
index e1f911a9f9..34c33c3b56 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -52,13 +52,13 @@ Includes some patches in the `patches` folder which have been sent upstream.
 
 ## cvtt
 
-- Upstream: https://github.com/elasota/cvtt
-- Version: 1.0.0-beta4 (cc8472a04ba110fe999c686d07af40f7839051fd, 2018)
+- Upstream: https://github.com/elasota/ConvectionKernels
+- Version: git (dc2dbbe0ae2cf2be06ef56d1021e2222a56c7fe2, 2021)
 - License: MIT
 
 Files extracted from upstream source:
 
-- all .cpp, .h, and .txt files in ConvectionKernels/
+- all .cpp, .h, and .txt files except the folders MakeTables and etc2packer.
 
 
 ## doctest
diff --git a/thirdparty/cvtt/ConvectionKernels.cpp b/thirdparty/cvtt/ConvectionKernels.cpp
deleted file mode 100644
index 8d379344e1..0000000000
--- a/thirdparty/cvtt/ConvectionKernels.cpp
+++ /dev/null
@@ -1,7586 +0,0 @@
-/*
-Convection Texture Tools
-Copyright (c) 2018 Eric Lasota
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject
-to the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
--------------------------------------------------------------------------------------
-
-Portions based on DirectX Texture Library (DirectXTex)
-
-Copyright (c) Microsoft Corporation. All rights reserved.
-Licensed under the MIT License.
-
-http://go.microsoft.com/fwlink/?LinkId=248926
-*/
-#include "ConvectionKernels.h"
-#include "ConvectionKernels_BC7_SingleColor.h"
-
-#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
-#define CVTT_USE_SSE2
-#endif
-
-#ifdef CVTT_USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#include <float.h>
-#include <assert.h>
-#include <string.h>
-#include <algorithm>
-#include <math.h>
-
-#define UNREFERENCED_PARAMETER(n) ((void)n)
-
-namespace cvtt
-{
-#ifdef CVTT_USE_SSE2
-    // SSE2 version
-    struct ParallelMath
-    {
-        typedef uint16_t ScalarUInt16;
-        typedef int16_t ScalarSInt16;
-
-        template<unsigned int TRoundingMode>
-        struct RoundForScope
-        {
-            unsigned int m_oldCSR;
-
-            RoundForScope()
-            {
-                m_oldCSR = _mm_getcsr();
-                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
-            }
-
-            ~RoundForScope()
-            {
-                _mm_setcsr(m_oldCSR);
-            }
-        };
-
-        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
-        {
-        };
-
-        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
-        {
-        };
-
-        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
-        {
-        };
-
-        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
-        {
-        };
-
-        static const int ParallelSize = 8;
-
-        enum Int16Subtype
-        {
-            IntSubtype_Signed,
-            IntSubtype_UnsignedFull,
-            IntSubtype_UnsignedTruncated,
-            IntSubtype_Abstract,
-        };
-
-        template<int TSubtype>
-        struct VInt16
-        {
-            __m128i m_value;
-
-            inline VInt16 operator+(int16_t other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
-                return result;
-            }
-
-            inline VInt16 operator+(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_add_epi16(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator|(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_or_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator&(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_and_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator-(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_sub_epi16(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator<<(int bits) const
-            {
-                VInt16 result;
-                result.m_value = _mm_slli_epi16(m_value, bits);
-                return result;
-            }
-        };
-
-        typedef VInt16<IntSubtype_Signed> SInt16;
-        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
-        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
-        typedef VInt16<IntSubtype_Abstract> AInt16;
-
-        template<int TSubtype>
-        struct VInt32
-        {
-            __m128i m_values[2];
-
-            inline VInt32 operator+(const VInt32& other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline VInt32 operator-(const VInt32& other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline VInt32 operator<<(const int other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
-                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
-                return result;
-            }
-        };
-
-        typedef VInt32<IntSubtype_Signed> SInt32;
-        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
-        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
-        typedef VInt32<IntSubtype_Abstract> AInt32;
-
-        template<class TTargetType>
-        struct LosslessCast
-        {
-#ifdef CVTT_PERMIT_ALIASING
-            template<int TSrcSubtype>
-            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
-            {
-                return reinterpret_cast<VInt32<TSubtype>&>(src);
-            }
-
-            template<int TSrcSubtype>
-            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
-            {
-                return reinterpret_cast<VInt16<TSubtype>&>(src);
-            }
-#else
-            template<int TSrcSubtype>
-            static TTargetType Cast(const VInt32<TSrcSubtype> &src)
-            {
-                TTargetType result;
-                result.m_values[0] = src.m_values[0];
-                result.m_values[1] = src.m_values[1];
-                return result;
-            }
-
-            template<int TSrcSubtype>
-            static TTargetType Cast(const VInt16<TSrcSubtype> &src)
-            {
-                TTargetType result;
-                result.m_value = src.m_value;
-                return result;
-            }
-#endif
-        };
-
-        struct Int64
-        {
-            __m128i m_values[4];
-        };
-
-        struct Float
-        {
-            __m128 m_values[2];
-
-            inline Float operator+(const Float &other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator+(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-
-            inline Float operator-(const Float& other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator-() const
-            {
-                Float result;
-                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
-                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
-                return result;
-            }
-
-            inline Float operator*(const Float& other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator*(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-
-            inline Float operator/(const Float &other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator/(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-        };
-
-        struct Int16CompFlag
-        {
-            __m128i m_value;
-
-            inline Int16CompFlag operator&(const Int16CompFlag &other) const
-            {
-                Int16CompFlag result;
-                result.m_value = _mm_and_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline Int16CompFlag operator|(const Int16CompFlag &other) const
-            {
-                Int16CompFlag result;
-                result.m_value = _mm_or_si128(m_value, other.m_value);
-                return result;
-            }
-        };
-
-        struct FloatCompFlag
-        {
-            __m128 m_values[2];
-        };
-
-        template<int TSubtype>
-        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_add_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_and_si128(flag.m_value, a.m_value);
-            return result;
-        }
-
-        template<int TSubtype>
-        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
-        {
-            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
-        }
-
-        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
-        {
-            SInt16 result;
-            result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
-            return result;
-        }
-
-        template<int TSubtype>
-        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
-        {
-            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
-        }
-
-        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
-        {
-            for (int i = 0; i < 2; i++)
-                dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
-        }
-
-        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
-        {
-            for (int i = 0; i < 2; i++)
-                dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
-        }
-
-        static void MakeSafeDenominator(Float& v)
-        {
-            ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
-        }
-
-        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
-        {
-            int lostBits = 16 - precision;
-            if (lostBits == 0)
-                return v;
-
-            SInt16 result;
-            result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
-            return result;
-        }
-
-        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
-        {
-            int lostBits = 16 - precision;
-            if (lostBits == 0)
-                return v;
-
-            UInt16 result;
-            result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
-            return result;
-        }
-
-        static UInt16 Min(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i bitFlip = _mm_set1_epi16(-32768);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
-            return result;
-        }
-
-        static SInt16 Min(const SInt16 &a, const SInt16 &b)
-        {
-            SInt16 result;
-            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt15 Min(const UInt15 &a, const UInt15 &b)
-        {
-            UInt15 result;
-            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Min(const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static UInt16 Max(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i bitFlip = _mm_set1_epi16(-32768);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
-            return result;
-        }
-
-        static SInt16 Max(const SInt16 &a, const SInt16 &b)
-        {
-            SInt16 result;
-            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt15 Max(const UInt15 &a, const UInt15 &b)
-        {
-            UInt15 result;
-            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Max(const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static Float Clamp(const Float &v, float min, float max)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
-            return result;
-        }
-
-        static Float Reciprocal(const Float &v)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
-            return result;
-        }
-
-        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
-        {
-            int16_t values[8];
-            for (int i = 0; i < 8; i++)
-                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
-
-            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
-        }
-
-        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
-        {
-            int16_t values[8];
-            for (int i = 0; i < 8; i++)
-                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
-
-            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
-        }
-
-        static Float MakeFloat(float v)
-        {
-            Float f;
-            f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
-            return f;
-        }
-
-        static Float MakeFloatZero()
-        {
-            Float f;
-            f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
-            return f;
-        }
-
-        static UInt16 MakeUInt16(uint16_t v)
-        {
-            UInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static SInt16 MakeSInt16(int16_t v)
-        {
-            SInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static AInt16 MakeAInt16(int16_t v)
-        {
-            AInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static UInt15 MakeUInt15(uint16_t v)
-        {
-            UInt15 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static SInt32 MakeSInt32(int32_t v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_set1_epi32(v);
-            result.m_values[1] = _mm_set1_epi32(v);
-            return result;
-        }
-
-        static UInt31 MakeUInt31(uint32_t v)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_set1_epi32(v);
-            result.m_values[1] = _mm_set1_epi32(v);
-            return result;
-        }
-
-        static uint16_t Extract(const UInt16 &v, int offset)
-        {
-            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
-        }
-
-        static int16_t Extract(const SInt16 &v, int offset)
-        {
-            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
-        }
-
-        static uint16_t Extract(const UInt15 &v, int offset)
-        {
-            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
-        }
-
-        static int16_t Extract(const AInt16 &v, int offset)
-        {
-            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
-        }
-
-        static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
-        {
-            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
-        }
-
-        static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
-        {
-            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
-        }
-
-        static void PutSInt16(SInt16 &dest, int offset, int16_t v)
-        {
-            reinterpret_cast<int16_t*>(&dest)[offset] = v;
-        }
-
-        static float ExtractFloat(const Float& v, int offset)
-        {
-            return reinterpret_cast<const float*>(&v)[offset];
-        }
-
-        static void PutFloat(Float &dest, int offset, float v)
-        {
-            reinterpret_cast<float*>(&dest)[offset] = v;
-        }
-
-        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static FloatCompFlag Less(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        template<int TSubtype>
-        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static FloatCompFlag Equal(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static Float ToFloat(const UInt16 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
-            return result;
-        }
-
-        static UInt31 ToUInt31(const UInt16 &v)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
-            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
-            return result;
-        }
-
-        static SInt32 ToInt32(const UInt16 &v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
-            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
-            return result;
-        }
-
-        static SInt32 ToInt32(const SInt16 &v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
-            result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
-            return result;
-        }
-
-        static Float ToFloat(const SInt16 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
-            return result;
-        }
-
-        static Float ToFloat(const UInt15 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
-            return result;
-        }
-
-        static Float ToFloat(const UInt31 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
-            result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
-            return result;
-        }
-
-        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
-        {
-            __m128i lo = _mm_castps_si128(v.m_values[0]);
-            __m128i hi = _mm_castps_si128(v.m_values[1]);
-
-            Int16CompFlag result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
-        {
-            __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
-            __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
-
-            FloatCompFlag result;
-            result.m_values[0] = _mm_castsi128_ps(lo);
-            result.m_values[1] = _mm_castsi128_ps(hi);
-            return result;
-        }
-
-        static Int16CompFlag MakeBoolInt16(bool b)
-        {
-            Int16CompFlag result;
-            if (b)
-                result.m_value = _mm_set1_epi16(-1);
-            else
-                result.m_value = _mm_setzero_si128();
-            return result;
-        }
-
-        static FloatCompFlag MakeBoolFloat(bool b)
-        {
-            FloatCompFlag result;
-            if (b)
-                result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
-            else
-                result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
-            return result;
-        }
-
-        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
-            return result;
-        }
-
-        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
-            __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
-            return result;
-        }
-
-        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
-            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            UInt15 result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
-            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            SInt16 result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static Float Sqrt(const Float &f)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
-            return result;
-        }
-
-        static UInt16 Abs(const SInt16 &a)
-        {
-            __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
-            __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
-
-            UInt16 result;
-            result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
-            return result;
-        }
-
-        static Float Abs(const Float& a)
-        {
-            __m128 invMask = _mm_set1_ps(-0.0f);
-
-            Float result;
-            result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
-            result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
-            return result;
-        }
-
-        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
-        {
-            __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
-
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(diff, diff);
-            return result;
-        }
-
-        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
-        {
-            __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
-
-            __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
-            __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
-            __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
-            __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
-
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
-            result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
-
-            return result;
-        }
-
-        static Float TwosCLHalfToFloat(const SInt16 &v)
-        {
-            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
-
-            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
-            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
-            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
-
-            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
-
-            // Convert exponent to high-bits 
-            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
-
-            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
-
-            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
-            __m128i lowBits = _mm_slli_epi16(mantissa, 13);
-
-            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
-            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
-
-            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
-            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
-
-            Float result;
-            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
-            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
-
-            return result;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-            Float fb = TwosCLHalfToFloat(b);
-
-            Float diff = fa - fb;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a) * aWeight;
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static UInt16 RightShift(const UInt16 &v, int bits)
-        {
-            UInt16 result;
-            result.m_value = _mm_srli_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static UInt31 RightShift(const UInt31 &v, int bits)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
-            result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
-            return result;
-        }
-
-        static SInt16 RightShift(const SInt16 &v, int bits)
-        {
-            SInt16 result;
-            result.m_value = _mm_srai_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static UInt15 RightShift(const UInt15 &v, int bits)
-        {
-            UInt15 result;
-            result.m_value = _mm_srli_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static SInt32 RightShift(const SInt32 &v, int bits)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
-            result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
-            return result;
-        }
-
-        static SInt16 ToSInt16(const SInt32 &v)
-        {
-            SInt16 result;
-            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
-            return result;
-        }
-
-        static UInt16 ToUInt16(const UInt32 &v)
-        {
-            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
-            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
-
-            UInt16 result;
-            result.m_value = _mm_packs_epi32(low, high);
-            return result;
-        }
-
-        static UInt16 ToUInt16(const UInt31 &v)
-        {
-            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
-            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
-
-            UInt16 result;
-            result.m_value = _mm_packs_epi32(low, high);
-            return result;
-        }
-
-        static UInt15 ToUInt15(const UInt31 &v)
-        {
-            UInt15 result;
-            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
-            return result;
-        }
-
-        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
-        {
-            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
-        {
-            return XMultiply(b, a);
-        }
-
-        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
-        {
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
-        {
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
-        {
-            return XMultiply(b, a);
-        }
-
-        static bool AnySet(const Int16CompFlag &v)
-        {
-            return _mm_movemask_epi8(v.m_value) != 0;
-        }
-
-        static bool AllSet(const Int16CompFlag &v)
-        {
-            return _mm_movemask_epi8(v.m_value) == 0xffff;
-        }
-
-        static bool AnySet(const FloatCompFlag &v)
-        {
-            return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
-        }
-
-        static bool AllSet(const FloatCompFlag &v)
-        {
-            return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
-        }
-    };
-
-#else
-    // Scalar version
-    struct ParallelMath
-    {
-        struct RoundTowardZeroForScope
-        {
-        };
-
-        struct RoundTowardNearestForScope
-        {
-        };
-
-        struct RoundUpForScope
-        {
-        };
-
-        struct RoundDownForScope
-        {
-        };
-
-        static const int ParallelSize = 1;
-
-        enum Int16Subtype
-        {
-            IntSubtype_Signed,
-            IntSubtype_UnsignedFull,
-            IntSubtype_UnsignedTruncated,
-            IntSubtype_Abstract,
-        };
-
-        typedef int32_t SInt16;
-        typedef int32_t UInt15;
-        typedef int32_t UInt16;
-        typedef int32_t AInt16;
-
-        typedef int32_t SInt32;
-        typedef int32_t UInt31;
-        typedef int32_t UInt32;
-        typedef int32_t AInt32;
-
-        typedef int32_t ScalarUInt16;
-        typedef int32_t ScalarSInt16;
-
-        typedef float Float;
-
-        template<class TTargetType>
-        struct LosslessCast
-        {
-            static const int32_t& Cast(const int32_t &src)
-            {
-                return src;
-            }
-        };
-
-        typedef bool Int16CompFlag;
-        typedef bool FloatCompFlag;
-
-        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
-        {
-            return a + b;
-        }
-
-        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
-        {
-            return a - b;
-        }
-
-        static float Select(bool flag, float a, float b)
-        {
-            return flag ? a : b;
-        }
-
-        static int32_t Select(bool flag, int32_t a, int32_t b)
-        {
-            return flag ? a : b;
-        }
-
-        static int32_t SelectOrZero(bool flag, int32_t a)
-        {
-            return flag ? a : 0;
-        }
-
-        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
-        {
-            if (flag)
-                dest = src;
-        }
-
-        static int32_t ConditionalNegate(bool flag, int32_t v)
-        {
-            return (flag) ? -v : v;
-        }
-
-        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
-        {
-            if (!flag)
-                dest = src;
-        }
-
-        static void ConditionalSet(float& dest, bool flag, float src)
-        {
-            if (flag)
-                dest = src;
-        }
-
-        static void NotConditionalSet(float& dest, bool flag, float src)
-        {
-            if (!flag)
-                dest = src;
-        }
-
-        static void MakeSafeDenominator(float& v)
-        {
-            if (v == 0.0f)
-                v = 1.0f;
-        }
-
-        static int32_t SignedRightShift(int32_t v, int bits)
-        {
-            return v >> bits;
-        }
-
-        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
-        {
-            v = (v << (32 - precision)) & 0xffffffff;
-            return SignedRightShift(v, 32 - precision);
-        }
-
-        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
-        {
-            return v & ((1 << precision) - 1);
-        }
-
-        static int32_t Min(int32_t a, int32_t b)
-        {
-            if (a < b)
-                return a;
-            return b;
-        }
-
-        static float Min(float a, float b)
-        {
-            if (a < b)
-                return a;
-            return b;
-        }
-
-        static int32_t Max(int32_t a, int32_t b)
-        {
-            if (a > b)
-                return a;
-            return b;
-        }
-
-        static float Max(float a, float b)
-        {
-            if (a > b)
-                return a;
-            return b;
-        }
-
-        static float Abs(float a)
-        {
-            return fabsf(a);
-        }
-
-        static int32_t Abs(int32_t a)
-        {
-            if (a < 0)
-                return -a;
-            return a;
-        }
-
-        static float Clamp(float v, float min, float max)
-        {
-            if (v < min)
-                return min;
-            if (v > max)
-                return max;
-            return v;
-        }
-
-        static float Reciprocal(float v)
-        {
-            return 1.0f / v;
-        }
-
-        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
-        {
-            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
-        }
-
-        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
-        {
-            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
-        }
-
-        static float MakeFloat(float v)
-        {
-            return v;
-        }
-
-        static float MakeFloatZero()
-        {
-            return 0.0f;
-        }
-
-        static int32_t MakeUInt16(uint16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeSInt16(int16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeAInt16(int16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeUInt15(uint16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeSInt32(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeUInt31(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t Extract(int32_t v, int offset)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            return v;
-        }
-
-        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static float ExtractFloat(float v, int offset)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            return v;
-        }
-
-        static void PutFloat(float &dest, int offset, float v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static bool Less(int32_t a, int32_t b)
-        {
-            return a < b;
-        }
-
-        static bool Less(float a, float b)
-        {
-            return a < b;
-        }
-
-        static bool LessOrEqual(int32_t a, int32_t b)
-        {
-            return a < b;
-        }
-
-        static bool LessOrEqual(float a, float b)
-        {
-            return a < b;
-        }
-
-        static bool Equal(int32_t a, int32_t b)
-        {
-            return a == b;
-        }
-
-        static bool Equal(float a, float b)
-        {
-            return a == b;
-        }
-
-        static float ToFloat(int32_t v)
-        {
-            return static_cast<float>(v);
-        }
-
-        static int32_t ToUInt31(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToInt32(int32_t v)
-        {
-            return v;
-        }
-
-        static bool FloatFlagToInt16(bool v)
-        {
-            return v;
-        }
-
-        static bool Int16FlagToFloat(bool v)
-        {
-            return v;
-        }
-
-        static bool MakeBoolInt16(bool b)
-        {
-            return b;
-        }
-
-        static bool MakeBoolFloat(bool b)
-        {
-            return b;
-        }
-
-        static bool AndNot(bool a, bool b)
-        {
-            return a && !b;
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
-        {
-            UNREFERENCED_PARAMETER(rtz);
-            return static_cast<int>(v);
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
-        {
-            UNREFERENCED_PARAMETER(ru);
-            return static_cast<int>(ceilf(v));
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
-        {
-            UNREFERENCED_PARAMETER(rd);
-            return static_cast<int>(floorf(v));
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
-        {
-            UNREFERENCED_PARAMETER(rtn);
-            return static_cast<int>(floorf(v + 0.5f));
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        static float Sqrt(float f)
-        {
-            return sqrtf(f);
-        }
-
-        static int32_t SqDiffUInt8(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static int32_t SqDiffInt16(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static int32_t SqDiffSInt16(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static float TwosCLHalfToFloat(int32_t v)
-        {
-            int32_t absV = (v < 0) ? -v : v;
-
-            int32_t signBits = (absV & -32768);
-            int32_t mantissa = (absV & 0x03ff);
-            int32_t exponent = (absV & 0x7c00);
-
-            bool isDenormal = (exponent == 0);
-
-            // Convert exponent to high-bits
-            exponent = (exponent >> 3) + 14336;
-
-            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
-
-            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
-
-            float f, correction;
-            memcpy(&f, &fBits, 4);
-            memcpy(&correction, &denormalCorrection, 4);
-
-            return f - correction;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-            Float fb = TwosCLHalfToFloat(b);
-
-            Float diff = fa - fb;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a) * aWeight;
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static int32_t RightShift(int32_t v, int bits)
-        {
-            return SignedRightShift(v, bits);
-        }
-
-        static int32_t ToSInt16(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToUInt16(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToUInt15(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t XMultiply(int32_t a, int32_t b)
-        {
-            return a * b;
-        }
-
-        static int32_t CompactMultiply(int32_t a, int32_t b)
-        {
-            return a * b;
-        }
-
-        static bool AnySet(bool v)
-        {
-            return v;
-        }
-
-        static bool AllSet(bool v)
-        {
-            return v;
-        }
-    };
-
-#endif
-
-    namespace Internal
-    {
-        namespace BC7Data
-        {
-            enum AlphaMode
-            {
-                AlphaMode_Combined,
-                AlphaMode_Separate,
-                AlphaMode_None,
-            };
-
-            enum PBitMode
-            {
-                PBitMode_PerEndpoint,
-                PBitMode_PerSubset,
-                PBitMode_None
-            };
-
-            struct BC7ModeInfo
-            {
-                PBitMode m_pBitMode;
-                AlphaMode m_alphaMode;
-                int m_rgbBits;
-                int m_alphaBits;
-                int m_partitionBits;
-                int m_numSubsets;
-                int m_indexBits;
-                int m_alphaIndexBits;
-                bool m_hasIndexSelector;
-            };
-
-            BC7ModeInfo g_modes[] =
-            {
-                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
-                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
-                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
-                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
-
-                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
-                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
-                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
-                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
-            };
-
-			const int g_weight2[] = { 0, 21, 43, 64 };
-			const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
-			const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
-
-			const int *g_weightTables[] =
-			{
-				NULL,
-				NULL,
-				g_weight2,
-				g_weight3,
-				g_weight4
-			};
-
-            struct BC6HModeInfo
-            {
-                uint16_t m_modeID;
-                bool m_partitioned;
-                bool m_transformed;
-                int m_aPrec;
-                int m_bPrec[3];
-            };
-
-            // [partitioned][precision]
-            bool g_hdrModesExistForPrecision[2][17] =
-            {
-                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
-                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
-                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
-            };
-
-            BC6HModeInfo g_hdrModes[] =
-            {
-                { 0x00, true,  true,  10,{ 5, 5, 5 } },
-                { 0x01, true,  true,  7,{ 6, 6, 6 } },
-                { 0x02, true,  true,  11,{ 5, 4, 4 } },
-                { 0x06, true,  true,  11,{ 4, 5, 4 } },
-                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
-                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
-                { 0x12, true,  true,  8,{ 6, 5, 5 } },
-                { 0x16, true,  true,  8,{ 5, 6, 5 } },
-                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
-                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
-                { 0x03, false, false, 10,{ 10, 10, 10 } },
-                { 0x07, false, true,  11,{ 9, 9, 9 } },
-                { 0x0b, false, true,  12,{ 8, 8, 8 } },
-                { 0x0f, false, true,  16,{ 4, 4, 4 } },
-            };
-
-            const int g_maxHDRPrecision = 16;
-
-            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
-
-            static uint16_t g_partitionMap[64] =
-            {
-                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
-                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
-                0xC800, 0xFFEC, 0xFE80, 0xE800,
-                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
-                0xF710, 0x008E, 0x7100, 0x08CE,
-                0x008C, 0x7310, 0x3100, 0x8CCE,
-                0x088C, 0x3110, 0x6666, 0x366C,
-                0x17E8, 0x0FF0, 0x718E, 0x399C,
-                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
-                0x3c3c, 0x55aa, 0x9696, 0xa55a,
-                0x73ce, 0x13c8, 0x324c, 0x3bdc,
-                0x6996, 0xc33c, 0x9966, 0x660,
-                0x272, 0x4e4, 0x4e40, 0x2720,
-                0xc936, 0x936c, 0x39c6, 0x639c,
-                0x9336, 0x9cc6, 0x817e, 0xe718,
-                0xccf0, 0xfcc, 0x7744, 0xee22,
-            };
-
-            static uint32_t g_partitionMap2[64] =
-            {
-                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
-                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
-                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
-                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
-                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
-                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
-                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
-                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
-                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
-                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
-                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
-                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
-                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
-                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
-                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
-                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
-            };
-
-            static int g_fixupIndexes2[64] =
-            {
-                15,15,15,15,
-                15,15,15,15,
-                15,15,15,15,
-                15,15,15,15,
-                15, 2, 8, 2,
-                2, 8, 8,15,
-                2, 8, 2, 2,
-                8, 8, 2, 2,
-
-                15,15, 6, 8,
-                2, 8,15,15,
-                2, 8, 2, 2,
-                2,15,15, 6,
-                6, 2, 6, 8,
-                15,15, 2, 2,
-                15,15,15,15,
-                15, 2, 2,15,
-            };
-
-            static int g_fixupIndexes3[64][2] =
-            {
-                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
-                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
-                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
-                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
-                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
-                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
-                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
-                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
-
-                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
-                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
-                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
-                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
-                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
-                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
-                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
-                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
-            };
-
-            static const unsigned char g_fragments[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
-                0, 1, 2, 3,  // 16, 4
-                0, 1, 4,  // 20, 3
-                0, 1, 2, 4,  // 23, 4
-                2, 3, 7,  // 27, 3
-                1, 2, 3, 7,  // 30, 4
-                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
-                0, 1, 4, 8,  // 42, 4
-                0, 1, 2, 4, 5, 8,  // 46, 6
-                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
-                1, 4, 5, 6, 9,  // 60, 5
-                2, 5, 6, 7, 10,  // 65, 5
-                5, 6, 9, 10,  // 70, 4
-                2, 3, 7, 11,  // 74, 4
-                1, 2, 3, 6, 7, 11,  // 78, 6
-                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
-                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
-                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
-                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
-                0, 4, 8, 12,  // 128, 4
-                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
-                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
-                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
-                3, 6, 7, 8, 9, 12,  // 158, 6
-                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
-                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
-                5, 8, 9, 10, 13,  // 192, 5
-                8, 12, 13,  // 197, 3
-                4, 8, 12, 13,  // 200, 4
-                2, 3, 6, 9, 12, 13,  // 204, 6
-                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
-                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
-                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
-                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
-                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
-                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
-                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
-                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
-                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
-                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
-                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
-                6, 9, 10, 11, 14,  // 317, 5
-                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
-                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
-                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
-                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
-                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
-                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
-                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
-                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
-                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
-                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
-                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
-                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
-                8, 12, 13, 14,  // 418, 4
-                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
-                4, 8, 9, 12, 13, 14,  // 430, 6
-                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
-                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
-                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
-                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
-                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
-                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
-                3, 7, 11, 15,  // 490, 4
-                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
-                0, 4, 5, 10, 11, 15,  // 502, 6
-                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
-                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
-                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
-                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
-                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
-                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
-                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
-                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
-                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
-                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
-                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
-                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
-                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
-                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
-                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
-                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
-                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
-                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
-                0, 1, 5, 10, 14, 15,  // 669, 6
-                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
-                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
-                11, 14, 15,  // 691, 3
-                7, 11, 14, 15,  // 694, 4
-                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
-                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
-                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
-                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
-                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
-                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
-                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
-                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
-                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
-                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
-                11, 13, 14, 15,  // 792, 4
-                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
-                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
-                7, 10, 11, 13, 14, 15,  // 814, 6
-                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
-                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
-                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
-                12, 13, 14, 15,  // 848, 4
-                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
-                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
-                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
-                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
-                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
-                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
-                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
-                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
-                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
-                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
-                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
-                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
-                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
-                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
-                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
-                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
-                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
-                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
-                0, 2,  // 1040, 2
-                1, 3,  // 1042, 2
-                0, 1, 4, 5,  // 1044, 4
-                0, 1, 2, 4, 5,  // 1048, 5
-                2, 3, 6,  // 1053, 3
-                0, 2, 4, 6,  // 1056, 4
-                1, 2, 5, 6,  // 1060, 4
-                0, 1, 2, 3, 5, 6,  // 1064, 6
-                0, 1, 2, 4, 5, 6,  // 1070, 6
-                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
-                0, 3, 4, 7,  // 1083, 4
-                0, 1, 2, 3, 4, 7,  // 1087, 6
-                1, 3, 5, 7,  // 1093, 4
-                2, 3, 6, 7,  // 1097, 4
-                1, 2, 3, 6, 7,  // 1101, 5
-                1, 2, 3, 5, 6, 7,  // 1106, 6
-                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
-                4, 5, 6, 7,  // 1119, 4
-                0, 8,  // 1123, 2
-                0, 1, 4, 5, 8,  // 1125, 5
-                0, 1, 8, 9,  // 1130, 4
-                4, 5, 8, 9,  // 1134, 4
-                0, 1, 4, 5, 8, 9,  // 1138, 6
-                2, 6, 8, 9,  // 1144, 4
-                6, 7, 8, 9,  // 1148, 4
-                0, 2, 4, 6, 8, 10,  // 1152, 6
-                1, 2, 5, 6, 9, 10,  // 1158, 6
-                0, 3, 4, 7, 9, 10,  // 1164, 6
-                0, 1, 2, 8, 9, 10,  // 1170, 6
-                4, 5, 6, 8, 9, 10,  // 1176, 6
-                3, 11,  // 1182, 2
-                2, 3, 6, 7, 11,  // 1184, 5
-                0, 3, 8, 11,  // 1189, 4
-                0, 3, 4, 7, 8, 11,  // 1193, 6
-                1, 3, 5, 7, 9, 11,  // 1199, 6
-                2, 3, 10, 11,  // 1205, 4
-                1, 5, 10, 11,  // 1209, 4
-                4, 5, 10, 11,  // 1213, 4
-                6, 7, 10, 11,  // 1217, 4
-                2, 3, 6, 7, 10, 11,  // 1221, 6
-                1, 2, 3, 9, 10, 11,  // 1227, 6
-                5, 6, 7, 9, 10, 11,  // 1233, 6
-                8, 9, 10, 11,  // 1239, 4
-                4, 12,  // 1243, 2
-                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
-                8, 9, 12,  // 1253, 3
-                0, 4, 5, 8, 9, 12,  // 1256, 6
-                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
-                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
-                1, 5, 9, 13,  // 1276, 4
-                6, 7, 9, 13,  // 1280, 4
-                1, 4, 7, 10, 13,  // 1284, 5
-                1, 6, 8, 11, 13,  // 1289, 5
-                0, 1, 12, 13,  // 1294, 4
-                4, 5, 12, 13,  // 1298, 4
-                0, 1, 6, 7, 12, 13,  // 1302, 6
-                0, 1, 4, 8, 12, 13,  // 1308, 6
-                8, 9, 12, 13,  // 1314, 4
-                4, 8, 9, 12, 13,  // 1318, 5
-                4, 5, 8, 9, 12, 13,  // 1323, 6
-                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
-                0, 1, 6, 10, 12, 13,  // 1336, 6
-                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
-                0, 1, 10, 11, 12, 13,  // 1349, 6
-                2, 4, 7, 9, 14,  // 1355, 5
-                4, 5, 10, 14,  // 1360, 4
-                2, 6, 10, 14,  // 1364, 4
-                2, 5, 8, 11, 14,  // 1368, 5
-                0, 2, 12, 14,  // 1373, 4
-                8, 10, 12, 14,  // 1377, 4
-                4, 6, 8, 10, 12, 14,  // 1381, 6
-                13, 14,  // 1387, 2
-                9, 10, 13, 14,  // 1389, 4
-                5, 6, 9, 10, 13, 14,  // 1393, 6
-                0, 1, 2, 12, 13, 14,  // 1399, 6
-                4, 5, 6, 12, 13, 14,  // 1405, 6
-                8, 9, 12, 13, 14,  // 1411, 5
-                8, 9, 10, 12, 13, 14,  // 1416, 6
-                7, 15,  // 1422, 2
-                0, 5, 10, 15,  // 1424, 4
-                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
-                10, 11, 15,  // 1436, 3
-                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
-                3, 6, 7, 10, 11, 15,  // 1446, 6
-                12, 15,  // 1452, 2
-                0, 3, 12, 15,  // 1454, 4
-                4, 7, 12, 15,  // 1458, 4
-                0, 3, 6, 9, 12, 15,  // 1462, 6
-                0, 3, 5, 10, 12, 15,  // 1468, 6
-                8, 11, 12, 15,  // 1474, 4
-                5, 6, 8, 11, 12, 15,  // 1478, 6
-                4, 7, 8, 11, 12, 15,  // 1484, 6
-                1, 3, 13, 15,  // 1490, 4
-                9, 11, 13, 15,  // 1494, 4
-                5, 7, 9, 11, 13, 15,  // 1498, 6
-                2, 3, 14, 15,  // 1504, 4
-                2, 3, 4, 5, 14, 15,  // 1508, 6
-                6, 7, 14, 15,  // 1514, 4
-                2, 3, 5, 9, 14, 15,  // 1518, 6
-                2, 3, 8, 9, 14, 15,  // 1524, 6
-                10, 14, 15,  // 1530, 3
-                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
-                2, 3, 7, 11, 14, 15,  // 1540, 6
-                10, 11, 14, 15,  // 1546, 4
-                7, 10, 11, 14, 15,  // 1550, 5
-                6, 7, 10, 11, 14, 15,  // 1555, 6
-                1, 2, 3, 13, 14, 15,  // 1561, 6
-                5, 6, 7, 13, 14, 15,  // 1567, 6
-                10, 11, 13, 14, 15,  // 1573, 5
-                9, 10, 11, 13, 14, 15,  // 1578, 6
-                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
-                9, 10, 12, 13, 14, 15,  // 1592, 6
-                8, 11, 12, 13, 14, 15,  // 1598, 6
-                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
-            };
-            static const int g_shapeRanges[][2] =
-            {
-                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
-                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
-                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
-                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
-                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
-                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
-                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
-                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
-                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
-                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
-                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
-                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
-                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
-                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
-                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
-                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
-                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
-                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
-                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
-                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
-                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
-                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
-                { 1604, 8 },
-            };
-            static const int g_shapes1[][2] =
-            {
-                { 0, 16 }
-            };
-            static const int g_shapes2[64][2] =
-            {
-                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
-                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
-                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
-                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
-                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
-                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
-                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
-                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
-            };
-            static const int g_shapes3[64][3] =
-            {
-                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
-                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
-                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
-                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
-                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
-                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
-                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
-                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
-            };
-
-            static const int g_shapeList1[] =
-            {
-                0,
-            };
-
-            static const int g_shapeList1Collapse[] =
-            {
-                0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-            static const int g_shapeList2[] =
-            {
-                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
-                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
-                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
-                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
-                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
-                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
-                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
-                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
-                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
-                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
-                122, 123, 124, 125, 126, 127, 128,
-            };
-            static const int g_shapeList2Collapse[] =
-            {
-                -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
-                10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-                43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-                54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
-                65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
-                76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
-                87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
-                98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
-                109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-                120, 121, 122, 123, 124, 125, 126, 127, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-
-            static const int g_shapeList12[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128,
-            };
-
-            static const int g_shapeList12Collapse[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-
-            static const int g_shapeList3[] =
-            {
-                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
-                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
-                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
-                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
-                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
-                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
-                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
-                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
-                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
-                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
-                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
-                235, 236, 237, 238, 239, 240, 241, 242,
-            };
-
-            static const int g_shapeList3Collapse[] =
-            {
-                -1, 0, 1, -1, 2, -1, 3, -1, 4, -1, -1,
-                -1, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1,
-                -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, -1,
-                11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, 13,
-                -1, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1,
-                16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, 17, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, 18, -1, -1, -1, -1, 19, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 20, -1, -1, 21,
-                22, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, 24, -1, -1, -1, -1, 25, 26, 27, 28,
-                29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-                40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
-                62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
-                73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
-                84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
-                95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
-                106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
-                117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-                128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
-                139,
-            };
-
-            static const int g_shapeList3Short[] =
-            {
-                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
-                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
-                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
-                233, 237, 240,
-            };
-
-            static const int g_shapeList3ShortCollapse[] =
-            {
-                -1, 0, 1, -1, 2, -1, 3, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1,
-                -1, -1, -1, -1, 8, -1, -1, -1, -1, -1, -1,
-                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 11, -1, -1, -1,
-                12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, 14,
-                15, -1, -1, -1, 16, -1, -1, -1, -1, -1, 17,
-                18, -1, -1, 19, -1, 20, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, 23,
-                -1, 24, 25, -1, -1, -1, -1, -1, -1, -1, 26,
-                27, -1, -1, -1, -1, -1, -1, -1, 28, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 29, -1, -1, -1,
-                -1, -1, 30, 31, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, 32, 33, -1, -1, -1, 34, -1, -1, 35, -1,
-                -1,
-            };
-
-            static const int g_shapeListAll[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
-                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
-                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
-                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
-                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
-                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
-                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
-                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
-                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
-                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
-                242,
-            };
-
-            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
-            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
-            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
-            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
-            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
-            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
-            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
-
-            static const int g_maxFragmentsPerMode = (g_numShapes2 > g_numShapes3) ? g_numShapes2 : g_numShapes3;
-        }
-
-        namespace BC6HData
-        {
-            enum EField
-            {
-                NA, // N/A
-                M,  // Mode
-                D,  // Shape
-                RW,
-                RX,
-                RY,
-                RZ,
-                GW,
-                GX,
-                GY,
-                GZ,
-                BW,
-                BX,
-                BY,
-                BZ,
-            };
-
-            struct ModeDescriptor
-            {
-                EField m_eField;
-                uint8_t   m_uBit;
-            };
-
-            const ModeDescriptor g_modeDescriptors[14][82] =
-            {
-                {   // Mode 1 (0x00) - 10 5 5 5
-                    { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 2 (0x01) - 7 6 6 6
-                    { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 3 (0x02) - 11 5 4 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 4 (0x06) - 11 4 5 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 5 (0x0a) - 11 4 4 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
-                    { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 6 (0x0e) - 9 5 5 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 7 (0x12) - 8 6 5 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 8 (0x16) - 8 5 6 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 9 (0x1a) - 8 5 5 6
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 10 (0x1e) - 6 6 6 6
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 11 (0x03) - 10 10
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 12 (0x07) - 11 9
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 13 (0x0b) - 12 8
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 14 (0x0f) - 16 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
-                    { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
-                    { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
-                    { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-            };
-        }
-
-        struct PackingVector
-        {
-            uint32_t m_vector[4];
-            int m_offset;
-
-            void Init()
-            {
-                for (int i = 0; i < 4; i++)
-                    m_vector[i] = 0;
-
-                m_offset = 0;
-            }
-
-            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
-            {
-                int vOffset = m_offset >> 5;
-                int bitOffset = m_offset & 0x1f;
-
-                m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
-
-                int overflowBits = bitOffset + bits - 32;
-                if (overflowBits > 0)
-                    m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
-
-                m_offset += bits;
-            }
-
-            inline void Flush(uint8_t* output)
-            {
-                assert(m_offset == 128);
-
-                for (int v = 0; v < 4; v++)
-                {
-                    uint32_t chunk = m_vector[v];
-                    for (int b = 0; b < 4; b++)
-                        output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
-                }
-            }
-        };
-
-
-		struct UnpackingVector
-		{
-			uint32_t m_vector[4];
-
-			void Init(const uint8_t *bytes)
-			{
-				for (int i = 0; i < 4; i++)
-					m_vector[i] = 0;
-
-				for (int b = 0; b < 16; b++)
-					m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
-			}
-
-			inline ParallelMath::ScalarUInt16 Unpack(int bits)
-			{
-				uint32_t bitMask = (1 << bits) - 1;
-
-				ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
-
-				for (int i = 0; i < 4; i++)
-				{
-					m_vector[i] >>= bits;
-					if (i != 3)
-						m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
-				}
-
-				return result;
-			}
-		};
-
-        void ComputeTweakFactors(int tweak, int range, float *outFactors)
-        {
-            int totalUnits = range - 1;
-            int minOutsideUnits = ((tweak >> 1) & 1);
-            int maxOutsideUnits = (tweak & 1);
-            int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
-
-            outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits);
-            outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f;
-        }
-
-        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
-        {
-            if (isSigned)
-            {
-                ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
-                return (v * 32.0f + offset) / 31.0f;
-            }
-            else
-                return (v * 64.0f + 30.0f) / 31.0f;
-        }
-
-        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
-        {
-#ifdef CVTT_ENABLE_ASSERTS
-            for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                assert(ParallelMath::Extract(v, i) != -32768)
-#endif
-
-            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
-            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
-
-            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
-            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
-            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
-            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
-
-            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
-        }
-
-        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
-        {
-            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
-        }
-
-        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
-        {
-            for (int epi = 0; epi < 2; epi++)
-            {
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    if (isSigned)
-                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
-                    else
-                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
-                }
-            }
-        }
-
-        template<int TVectorSize>
-        class UnfinishedEndpoints
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            UnfinishedEndpoints()
-            {
-            }
-
-            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_base[ch] = base[ch];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_offset[ch] = offset[ch];
-            }
-
-            UnfinishedEndpoints(const UnfinishedEndpoints& other)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_base[ch] = other.m_base[ch];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_offset[ch] = other.m_offset[ch];
-            }
-
-            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MUInt15 channelEPs[2];
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f);
-                        channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
-                    }
-
-                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
-                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
-                }
-            }
-
-            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
-            {
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MSInt16 channelEPs[2];
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f);
-                        channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
-                    }
-
-                    outEP0[ch] = channelEPs[0];
-                    outEP1[ch] = channelEPs[1];
-                }
-            }
-
-            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
-            {
-                ParallelMath::RoundTowardNearestForScope roundingMode;
-
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
-                    MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
-                    outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
-                    outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
-                }
-            }
-
-            template<int TNewVectorSize>
-            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
-            {
-                MFloat newBase[TNewVectorSize];
-                MFloat newOffset[TNewVectorSize];
-
-                for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++)
-                {
-                    newBase[ch] = m_base[ch];
-                    newOffset[ch] = m_offset[ch];
-                }
-
-                MFloat fillerV = ParallelMath::MakeFloat(filler);
-
-                for (int ch = TVectorSize; ch < TNewVectorSize; ch++)
-                {
-                    newBase[ch] = fillerV;
-                    newOffset[ch] = ParallelMath::MakeFloatZero();
-                }
-
-                return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
-            }
-
-        private:
-            MFloat m_base[TVectorSize];
-            MFloat m_offset[TVectorSize];
-        };
-
-        template<int TMatrixSize>
-        class PackedCovarianceMatrix
-        {
-        public:
-            // 0: xx,
-            // 1: xy, yy
-            // 3: xz, yz, zz 
-            // 6: xw, yw, zw, ww
-            // ... etc.
-            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
-
-            typedef ParallelMath::Float MFloat;
-
-            PackedCovarianceMatrix()
-            {
-                for (int i = 0; i < PyramidSize; i++)
-                    m_values[i] = ParallelMath::MakeFloatZero();
-            }
-
-            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)
-            {
-                int index = 0;
-                for (int row = 0; row < TMatrixSize; row++)
-                {
-                    for (int col = 0; col <= row; col++)
-                    {
-                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
-                        index++;
-                    }
-                }
-            }
-
-            void Product(MFloat *outVec, const MFloat *inVec)
-            {
-                for (int row = 0; row < TMatrixSize; row++)
-                {
-                    MFloat sum = ParallelMath::MakeFloatZero();
-
-                    int index = (row * (row + 1)) >> 1;
-                    for (int col = 0; col < TMatrixSize; col++)
-                    {
-                        sum = sum + inVec[col] * m_values[index];
-                        if (col >= row)
-                            index += col + 1;
-                        else
-                            index++;
-                    }
-
-                    outVec[row] = sum;
-                }
-            }
-
-        private:
-            ParallelMath::Float m_values[PyramidSize];
-        };
-
-        static const int NumEndpointSelectorPasses = 3;
-
-        template<int TVectorSize, int TIterationCount>
-        class EndpointSelector
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-
-            EndpointSelector()
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_centroid[ch] = ParallelMath::MakeFloatZero();
-                    m_direction[ch] = ParallelMath::MakeFloatZero();
-                }
-                m_weightTotal = ParallelMath::MakeFloatZero();
-                m_minDist = ParallelMath::MakeFloat(FLT_MAX);
-                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
-            }
-
-            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
-            {
-                if (pass == 0)
-                    ContributeCentroid(value, weight);
-                else if (pass == 1)
-                    ContributeDirection(value, weight);
-                else if (pass == 2)
-                    ContributeMinMax(value);
-            }
-
-            void FinishPass(int pass)
-            {
-                if (pass == 0)
-                    FinishCentroid();
-                else if (pass == 1)
-                    FinishDirection();
-            }
-
-            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
-            {
-                MFloat unweightedBase[TVectorSize];
-                MFloat unweightedOffset[TVectorSize];
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
-                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
-
-                    float safeWeight = channelWeights[ch];
-                    if (safeWeight == 0.f)
-                        safeWeight = 1.0f;
-
-                    unweightedBase[ch] = min / channelWeights[ch];
-                    unweightedOffset[ch] = (max - min) / channelWeights[ch];
-                }
-
-                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
-            }
-
-        private:
-            void ContributeCentroid(const MFloat *value, const MFloat &weight)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
-                m_weightTotal = m_weightTotal + weight;
-            }
-
-            void FinishCentroid()
-            {
-                MFloat denom = m_weightTotal;
-                ParallelMath::MakeSafeDenominator(denom);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_centroid[ch] = m_centroid[ch] / denom;
-            }
-
-            void ContributeDirection(const MFloat *value, const MFloat &weight)
-            {
-                MFloat diff[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    diff[ch] = value[ch] - m_centroid[ch];
-
-                m_covarianceMatrix.Add(diff, weight);
-            }
-
-            void FinishDirection()
-            {
-                MFloat approx[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    approx[ch] = ParallelMath::MakeFloat(1.0f);
-
-                for (int i = 0; i < TIterationCount; i++)
-                {
-                    MFloat product[TVectorSize];
-                    m_covarianceMatrix.Product(product, approx);
-
-                    MFloat largestComponent = product[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
-
-                    // product = largestComponent*newApprox
-                    ParallelMath::MakeSafeDenominator(largestComponent);
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        approx[ch] = product[ch] / largestComponent;
-                }
-
-                // Normalize
-                MFloat approxLen = ParallelMath::MakeFloatZero();
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    approxLen = approxLen + approx[ch] * approx[ch];
-
-                approxLen = ParallelMath::Sqrt(approxLen);
-
-                ParallelMath::MakeSafeDenominator(approxLen);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_direction[ch] = approx[ch] / approxLen;
-            }
-
-            void ContributeMinMax(const MFloat *value)
-            {
-                MFloat dist = ParallelMath::MakeFloatZero();
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
-
-                m_minDist = ParallelMath::Min(m_minDist, dist);
-                m_maxDist = ParallelMath::Max(m_maxDist, dist);
-            }
-
-            ParallelMath::Float m_centroid[TVectorSize];
-            ParallelMath::Float m_direction[TVectorSize];
-            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix;
-            ParallelMath::Float m_weightTotal;
-
-            ParallelMath::Float m_minDist;
-            ParallelMath::Float m_maxDist;
-        };
-
-        static const ParallelMath::UInt16 g_weightReciprocals[] =
-        {
-            ParallelMath::MakeUInt16(0),        // -1 
-            ParallelMath::MakeUInt16(0),        // 0
-            ParallelMath::MakeUInt16(32768),    // 1
-            ParallelMath::MakeUInt16(16384),    // 2
-            ParallelMath::MakeUInt16(10923),    // 3
-            ParallelMath::MakeUInt16(8192),     // 4
-            ParallelMath::MakeUInt16(6554),     // 5
-            ParallelMath::MakeUInt16(5461),     // 6
-            ParallelMath::MakeUInt16(4681),     // 7
-            ParallelMath::MakeUInt16(4096),     // 8
-            ParallelMath::MakeUInt16(3641),     // 9
-            ParallelMath::MakeUInt16(3277),     // 10
-            ParallelMath::MakeUInt16(2979),     // 11
-            ParallelMath::MakeUInt16(2731),     // 12
-            ParallelMath::MakeUInt16(2521),     // 13
-            ParallelMath::MakeUInt16(2341),     // 14
-            ParallelMath::MakeUInt16(2185),     // 15
-        };
-
-        template<int TVectorSize>
-        class IndexSelector
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::UInt31 MUInt31;
-
-            template<class TInterpolationEPType, class TColorEPType>
-            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
-            {
-                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
-                // We need to select indexes using the color-space endpoints.
-
-                m_isUniform = true;
-                for (int ch = 1; ch < TVectorSize; ch++)
-                {
-                    if (channelWeights[ch] != channelWeights[0])
-                        m_isUniform = false;
-                }
-
-                // To work with channel weights, we need something where:
-                // pxDiff = px - ep[0]
-                // epDiff = ep[1] - ep[0]
-                //
-                // weightedEPDiff = epDiff * channelWeights
-                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
-                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
-                // index = normalizedIndex * maxValue
-                //
-                // Equivalent to:
-                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
-                // index = dot(axis, pxDiff)
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
-
-                m_range = range;
-                m_maxValue = static_cast<float>(range - 1);
-
-                MFloat epDiffWeighted[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
-                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
-                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
-                }
-
-                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
-
-                ParallelMath::MakeSafeDenominator(lenSquared);
-
-                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
-            }
-
-            template<bool TSigned>
-            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
-            {
-                MAInt16 converted[2][TVectorSize];
-                for (int epi = 0; epi < 2; epi++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);
-
-                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
-            }
-
-            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
-                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
-                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
-                }
-            }
-
-            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
-                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
-                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
-                }
-            }
-
-            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
-            {
-                ReconstructLDR_BC7(index, pixel, TVectorSize);
-            }
-
-            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
-            {
-                ReconstructLDRPrecise(index, pixel, TVectorSize);
-            }
-
-            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
-            {
-                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
-
-                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
-            }
-
-        protected:
-            MAInt16 m_endPoint[2][TVectorSize];
-
-        private:
-            MFloat m_origin[TVectorSize];
-            MFloat m_axis[TVectorSize];
-            int m_range;
-            float m_maxValue;
-            bool m_isUniform;
-        };
-
-
-        template<int TVectorSize>
-        class IndexSelectorHDR : public IndexSelector<TVectorSize>
-        {
-        public:
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt31 MUInt31;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::Float MFloat;
-
-        private:
-
-            MUInt15 InvertSingle(const MUInt15& anIndex) const
-            {
-                MUInt15 inverted = m_maxValueMinusOne - anIndex;
-                return ParallelMath::Select(m_isInverted, inverted, anIndex);
-            }
-
-            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
-                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
-
-                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
-
-                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6);
-
-                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
-                }
-            }
-
-            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
-                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
-
-                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
-
-                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6);
-
-                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
-                }
-            }
-
-            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const
-            {
-                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
-                return diff * diff;
-            }
-
-            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const
-            {
-                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);
-                return error;
-            }
-
-        public:
-
-            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)
-            {
-                assert(range <= 16);
-
-                m_range = range;
-
-                m_isInverted = ParallelMath::MakeBoolInt16(false);
-                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));
-
-                if (!fastIndexing)
-                {
-                    for (int i = 0; i < range; i++)
-                    {
-                        MSInt16 recon2CL[TVectorSize];
-
-                        if (isSigned)
-                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
-                        else
-                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
-
-                        for (int ch = 0; ch < TVectorSize; ch++)
-                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];
-                    }
-                }
-            }
-
-            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const
-            {
-                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
-            }
-
-            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const
-            {
-                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
-            }
-
-            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)
-            {
-                m_isInverted = invert;
-            }
-
-            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const
-            {
-                MUInt15 index = ParallelMath::MakeUInt15(0);
-
-                MFloat bestError = ErrorForInterpolator(0, pixel);
-                for (int i = 1; i < m_range; i++)
-                {
-                    MFloat error = ErrorForInterpolator(i, pixel);
-                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
-                    bestError = ParallelMath::Min(bestError, error);
-                }
-
-                return InvertSingle(index);
-            }
-
-            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
-            {
-                return InvertSingle(this->SelectIndexLDR(pixel, rtn));
-            }
-
-        private:
-            MFloat m_reconstructedInterpolators[16][TVectorSize];
-            ParallelMath::Int16CompFlag m_isInverted;
-            MUInt15 m_maxValueMinusOne;
-            int m_range;
-        };
-
-        // Solve for a, b where v = a*t + b
-        // This allows endpoints to be mapped to where T=0 and T=1
-        // Least squares from totals:
-        // a = (tv - t*v/w)/(tt - t*t/w)
-        // b = (v - a*t)/w
-        template<int TVectorSize>
-        class EndpointRefiner
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            MFloat m_tv[TVectorSize];
-            MFloat m_v[TVectorSize];
-            MFloat m_tt;
-            MFloat m_t;
-            MFloat m_w;
-            int m_wu;
-
-            float m_rcpMaxIndex;
-            float m_channelWeights[TVectorSize];
-            float m_rcpChannelWeights[TVectorSize];
-
-            void Init(int indexRange, const float channelWeights[TVectorSize])
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_tv[ch] = ParallelMath::MakeFloatZero();
-                    m_v[ch] = ParallelMath::MakeFloatZero();
-                }
-                m_tt = ParallelMath::MakeFloatZero();
-                m_t = ParallelMath::MakeFloatZero();
-                m_w = ParallelMath::MakeFloatZero();
-
-                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_channelWeights[ch] = channelWeights[ch];
-                    m_rcpChannelWeights[ch] = 1.0f;
-                    if (m_channelWeights[ch] != 0.0f)
-                        m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
-                }
-
-                m_wu = 0;
-            }
-
-            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
-            {
-                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat v = pwFloatPixel[ch] * weight;
-
-                    m_tv[ch] = m_tv[ch] + t * v;
-                    m_v[ch] = m_v[ch] + v;
-                }
-                m_tt = m_tt + weight * t * t;
-                m_t = m_t + weight * t;
-                m_w = m_w + weight;
-            }
-
-            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
-            {
-                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MFloat v = pwFloatPixel[ch];
-
-                    m_tv[ch] = m_tv[ch] + t * v;
-                    m_v[ch] = m_v[ch] + v;
-                }
-                m_tt = m_tt + t * t;
-                m_t = m_t + t;
-                m_wu++;
-            }
-
-            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
-            {
-                ContributeUnweightedPW(floatPixel, index, TVectorSize);
-            }
-
-            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
-            {
-                // a = (tv - t*v/w)/(tt - t*t/w)
-                // b = (v - a*t)/w
-                MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
-
-                ParallelMath::MakeSafeDenominator(w);
-                MFloat wRcp = ParallelMath::Reciprocal(w);
-
-                MFloat adenom = (m_tt * w - m_t * m_t) * wRcp;
-
-                ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
-                ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    /*
-                    if (adenom == 0.0)
-                        p1 = p2 = er.v / er.w;
-                    else
-                    {
-                        float4 a = (er.tv - er.t*er.v / er.w) / adenom;
-                        float4 b = (er.v - a * er.t) / er.w;
-                        p1 = b;
-                        p2 = a + b;
-                    }
-                    */
-
-                    MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
-                    MFloat b = (m_v[ch] - a * m_t) * wRcp;
-
-                    MFloat p1 = b;
-                    MFloat p2 = a + b;
-
-                    ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp));
-                    ParallelMath::ConditionalSet(p2, adenomZero, p1);
-
-                    // Unweight
-                    float inverseWeight = m_rcpChannelWeights[ch];
-
-                    endPoint[0][ch] = p1 * inverseWeight;
-                    endPoint[1][ch] = p2 * inverseWeight;
-                }
-            }
-
-            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                MFloat floatEndPoint[2][TVectorSize];
-                GetRefinedEndpoints(floatEndPoint);
-
-                for (int epi = 0; epi < 2; epi++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
-            }
-
-            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
-            }
-
-            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                MFloat floatEndPoint[2][TVectorSize];
-                GetRefinedEndpoints(floatEndPoint);
-
-                for (int epi = 0; epi < 2; epi++)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                    {
-                        MFloat f = floatEndPoint[epi][ch];
-                        if (isSigned)
-                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
-                        else
-                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
-                    }
-                }
-            }
-        };
-
-        template<int TVectorSize>
-        class AggregatedError
-        {
-        public:
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt31 MUInt31;
-            typedef ParallelMath::Float MFloat;
-
-            AggregatedError()
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
-            }
-
-            void Add(const MUInt16 &channelErrorUnweighted, int ch)
-            {
-                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);
-            }
-
-            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const
-            {
-                if (flags & cvtt::Flags::Uniform)
-                {
-                    MUInt31 total = m_errorUnweighted[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        total = total + m_errorUnweighted[ch];
-                    return ParallelMath::ToFloat(total);
-                }
-                else
-                {
-                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
-                    return total;
-                }
-            }
-
-        private:
-            MUInt31 m_errorUnweighted[TVectorSize];
-        };
-
-        class BCCommon
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            static int TweakRoundsForRange(int range)
-            {
-                if (range == 3)
-                    return 3;
-                return 4;
-            }
-
-            template<int TVectorSize>
-            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
-            {
-                for (int ch = 0; ch < numRealChannels; ch++)
-                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
-            }
-
-            template<int TVectorSize>
-            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
-            {
-                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
-            {
-                AggregatedError<TVectorSize> aggError;
-                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
-                return aggError.Finalize(flags, channelWeightsSq);
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
-            {
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & Flags::Uniform)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
-                }
-                else
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
-                }
-
-                return error;
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
-            {
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & Flags::Uniform)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
-                }
-                else
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
-                }
-
-                return error;
-            }
-
-            template<int TChannelCount>
-            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
-            {
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < TChannelCount; ch++)
-                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
-                }
-            }
-
-            template<int TChannelCount>
-            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
-            {
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < TChannelCount; ch++)
-                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
-                }
-            }
-        };
-
-        class BC7Computer
-        {
-        public:
-            static const int MaxTweakRounds = 4;
-
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::Float MFloat;
-
-            struct WorkInfo
-            {
-                MUInt15 m_mode;
-                MFloat m_error;
-                MUInt15 m_ep[3][2][4];
-                MUInt15 m_indexes[16];
-                MUInt15 m_indexes2[16];
-
-                union
-                {
-                    MUInt15 m_partition;
-                    struct IndexSelectorAndRotation
-                    {
-                        MUInt15 m_indexSelector;
-                        MUInt15 m_rotation;
-                    } m_isr;
-                } m_u;
-            };
-
-            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
-            {
-                ParallelMath::RoundTowardNearestForScope roundingMode;
-
-                float tf[2];
-                ComputeTweakFactors(tweak, range, tf);
-
-                MFloat base = ParallelMath::ToFloat(original[0]);
-                MFloat offs = ParallelMath::ToFloat(original[1]) - base;
-
-                result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
-                result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
-            }
-
-            static void Quantize(MUInt15* color, int bits, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                float maxColor = static_cast<float>((1 << bits) - 1);
-
-                for (int i = 0; i < channels; i++)
-                    color[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(ParallelMath::ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f), roundingMode);
-            }
-
-            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                uint16_t pShift = static_cast<uint16_t>(1 << (7 - bits));
-                MUInt15 pShiftV = ParallelMath::MakeUInt15(pShift);
-
-                float maxColorF = static_cast<float>(255 - (1 << (7 - bits)));
-
-                float maxQuantized = static_cast<float>((1 << bits) - 1);
-
-                for (int ch = 0; ch < channels; ch++)
-                {
-                    MUInt15 clr = color[ch];
-                    if (p)
-                        clr = ParallelMath::Max(clr, pShiftV) - pShiftV;
-
-                    MFloat rerangedColor = ParallelMath::ToFloat(clr) * maxQuantized / maxColorF;
-
-                    clr = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized), roundingMode) << 1;
-                    if (p)
-                        clr = clr | ParallelMath::MakeUInt15(1);
-
-                    color[ch] = clr;
-                }
-            }
-
-            static void Unquantize(MUInt15* color, int bits, int channels)
-            {
-                for (int ch = 0; ch < channels; ch++)
-                {
-                    MUInt15 clr = color[ch];
-                    clr = clr << (8 - bits);
-                    color[ch] = clr | ParallelMath::RightShift(clr, bits);
-                }
-            }
-
-            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 4, p[j], 3, roundingMode);
-                    Unquantize(ep[j], 5, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 6, p, 3, roundingMode);
-                    Unquantize(ep[j], 7, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints2(MUInt15 ep[2][4], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(ep[j], 5, 3, roundingMode);
-                    Unquantize(ep[j], 5, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 7, p[j], 3, roundingMode);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(epRGB[j], 5, 3, roundingMode);
-                    Unquantize(epRGB[j], 5, 3);
-
-                    Quantize(epA + j, 6, 1, roundingMode);
-                    Unquantize(epA + j, 6, 1);
-                }
-            }
-
-            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(epRGB[j], 7, 3, roundingMode);
-                    Unquantize(epRGB[j], 7, 3);
-                }
-
-                // Alpha is full precision
-                (void)epA;
-            }
-
-            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                    QuantizeP(ep[j], 7, p[j], 4, roundingMode);
-            }
-
-            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 5, p[j], 4, roundingMode);
-                    Unquantize(ep[j], 6, 4);
-                }
-            }
-
-            struct SinglePlaneTemporaries
-            {
-                UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
-                UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];
-
-                MUInt15 fragmentBestIndexes[BC7Data::g_numFragments];
-                MUInt15 shapeBestEP[BC7Data::g_maxFragmentsPerMode][2][4];
-                MFloat shapeBestError[BC7Data::g_maxFragmentsPerMode];
-            };
-
-            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
-
-                MUInt15 intAverage[4];
-                for (int ch = 0; ch < 4; ch++)
-                    intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
-
-                MUInt15 eps[2][4];
-                MUInt15 reconstructed[4];
-                MUInt15 index = ParallelMath::MakeUInt15(0);
-
-                for (int epi = 0; epi < 2; epi++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                        eps[epi][ch] = ParallelMath::MakeUInt15(0);
-                    eps[epi][3] = ParallelMath::MakeUInt15(255);
-                }
-
-                for (int ch = 0; ch < 3; ch++)
-                    reconstructed[ch] = ParallelMath::MakeUInt15(0);
-                reconstructed[3] = ParallelMath::MakeUInt15(255);
-
-                // Depending on the target index and parity bits, there are multiple valid solid colors.
-                // We want to find the one closest to the actual average.
-                MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
-                for (int t = 0; t < numTables; t++)
-                {
-                    const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
-
-                    ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
-
-                    MUInt15 candidateReconstructed[4];
-                    MUInt15 candidateEPs[2][4];
-
-                    for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                    {
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                        {
-                            ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
-                            assert(avgValue >= 0 && avgValue <= 255);
-
-                            const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
-
-                            ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
-                            ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
-                            ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
-                        }
-                    }
-
-                    MFloat avgError = ParallelMath::MakeFloatZero();
-                    for (int ch = 0; ch < numRealChannels; ch++)
-                    {
-                        MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
-                        avgError = avgError + delta * delta * channelWeightsSq[ch];
-                    }
-
-                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
-                    better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
-
-                    if (ParallelMath::AnySet(better))
-                    {
-                        ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
-
-                        MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
-
-                        ParallelMath::ConditionalSet(index, better, candidateIndex);
-
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                            ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
-
-                        for (int epi = 0; epi < 2; epi++)
-                            for (int ch = 0; ch < numRealChannels; ch++)
-                                ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
-                    }
-                }
-
-                AggregatedError<4> aggError;
-                for (int pxi = 0; pxi < shapeLength; pxi++)
-                {
-                    int px = fragmentStart[pxi];
-
-                    BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
-                }
-
-                MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
-
-                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
-                if (ParallelMath::AnySet(better))
-                {
-                    shapeBestError = ParallelMath::Min(shapeBestError, error);
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                            ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
-                    }
-
-                    for (int pxi = 0; pxi < shapeLength; pxi++)
-                        ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
-                }
-            }
-
-
-            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                float channelWeightsSq[4];
-
-                for (int ch = 0; ch < 4; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                SinglePlaneTemporaries temps;
-
-                MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
-                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
-                ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
-                for (int px = 0; px < 16; px++)
-                {
-                    MUInt15 a = pixels[px][3];
-                    maxAlpha = ParallelMath::Max(maxAlpha, a);
-                    minAlpha = ParallelMath::Min(minAlpha, a);
-
-                    isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
-                }
-
-                ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
-                ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
-
-                bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
-
-                // Try RGB modes if any block has a min alpha 251 or higher
-                bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
-
-                // Try mode 7 if any block has alpha.
-                // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
-                // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
-                // situations, and only by at most 1 unit of error per pixel.
-                bool allowMode7 = anyBlockHasAlpha;
-
-                MFloat preWeightedPixels[16][4];
-
-                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
-
-                const int *rgbInitialEPCollapseList = NULL;
-
-                // Get initial RGB endpoints
-                if (allowRGBModes)
-                {
-                    const int *shapeList;
-                    int numShapesToEvaluate;
-
-                    if (flags & Flags::BC7_EnablePartitioning)
-                    {
-                        if (flags & Flags::BC7_Enable3Subsets)
-                        {
-                            shapeList = BC7Data::g_shapeListAll;
-                            rgbInitialEPCollapseList = BC7Data::g_shapeListAll;
-                            numShapesToEvaluate = BC7Data::g_numShapesAll;
-                        }
-                        else
-                        {
-                            shapeList = BC7Data::g_shapeList12;
-                            rgbInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
-                            numShapesToEvaluate = BC7Data::g_numShapes12;
-                        }
-                    }
-                    else
-                    {
-                        shapeList = BC7Data::g_shapeList1;
-                        rgbInitialEPCollapseList = BC7Data::g_shapeList1Collapse;
-                        numShapesToEvaluate = BC7Data::g_numShapes1;
-                    }
-
-                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-
-                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                        int shapeSize = BC7Data::g_shapeRanges[shape][1];
-
-                        EndpointSelector<3, 8> epSelector;
-
-                        for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                        {
-                            for (int spx = 0; spx < shapeSize; spx++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + spx];
-                                epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
-                            }
-                            epSelector.FinishPass(epPass);
-                        }
-                        temps.unfinishedRGB[shapeIter] = epSelector.GetEndpoints(channelWeights);
-                    }
-                }
-
-                const int *rgbaInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
-
-                // Get initial RGBA endpoints
-                {
-                    const int *shapeList = BC7Data::g_shapeList12;
-                    int numShapesToEvaluate = BC7Data::g_numShapes12;
-
-                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-
-                        if (anyBlockHasAlpha || !allowRGBModes)
-                        {
-                            int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                            int shapeSize = BC7Data::g_shapeRanges[shape][1];
-
-                            EndpointSelector<4, 8> epSelector;
-
-                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                            {
-                                for (int spx = 0; spx < shapeSize; spx++)
-                                {
-                                    int px = BC7Data::g_fragments[shapeStart + spx];
-                                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
-                                }
-                                epSelector.FinishPass(epPass);
-                            }
-                            temps.unfinishedRGBA[shapeIter] = epSelector.GetEndpoints(channelWeights);
-                        }
-                        else
-                        {
-                            temps.unfinishedRGBA[shapeIter] = temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].ExpandTo<4>(255);
-                        }
-                    }
-                }
-
-                for (uint16_t mode = 0; mode <= 7; mode++)
-                {
-                    if (!(flags & Flags::BC7_EnablePartitioning) && BC7Data::g_modes[mode].m_numSubsets != 1)
-                        continue;
-
-                    if (!(flags & Flags::BC7_Enable3Subsets) && BC7Data::g_modes[mode].m_numSubsets == 3)
-                        continue;
-
-                    if (mode == 4 || mode == 5)
-                        continue;
-
-                    if (mode < 4 && !allowRGBModes)
-                        continue;
-
-                    if (mode == 7 && !allowMode7)
-                        continue;
-
-                    bool isRGB = (mode < 4);
-
-                    unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
-                    int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
-                    int indexPrec = BC7Data::g_modes[mode].m_indexBits;
-
-                    int parityBitMax = 1;
-                    if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                        parityBitMax = 4;
-                    else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
-                        parityBitMax = 2;
-
-                    int numRealChannels = isRGB ? 3 : 4;
-
-                    int numShapes;
-                    const int *shapeList;
-                    const int *shapeCollapseList;
-
-                    if (numSubsets == 1)
-                    {
-                        numShapes = BC7Data::g_numShapes1;
-                        shapeList = BC7Data::g_shapeList1;
-                        shapeCollapseList = BC7Data::g_shapeList1Collapse;
-                    }
-                    else if (numSubsets == 2)
-                    {
-                        numShapes = BC7Data::g_numShapes2;
-                        shapeList = BC7Data::g_shapeList2;
-                        shapeCollapseList = BC7Data::g_shapeList2Collapse;
-                    }
-                    else
-                    {
-                        assert(numSubsets == 3);
-                        if (numPartitions == 16)
-                        {
-                            numShapes = BC7Data::g_numShapes3Short;
-                            shapeList = BC7Data::g_shapeList3Short;
-                            shapeCollapseList = BC7Data::g_shapeList3ShortCollapse;
-                        }
-                        else
-                        {
-                            assert(numPartitions == 64);
-                            numShapes = BC7Data::g_numShapes3;
-                            shapeList = BC7Data::g_shapeList3;
-                            shapeCollapseList = BC7Data::g_shapeList3Collapse;
-                        }
-                    }
-
-                    for (int slot = 0; slot < BC7Data::g_maxFragmentsPerMode; slot++)
-                        temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
-
-                    for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                        int shapeLength = BC7Data::g_shapeRanges[shape][1];
-                        int shapeCollapsedEvalIndex = shapeCollapseList[shape];
-
-                        AggregatedError<1> alphaAggError;
-                        if (isRGB && anyBlockHasAlpha)
-                        {
-                            MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
-
-                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + pxi];
-                                MUInt15 original[1] = { pixels[px][3] };
-                                BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
-                            }
-                        }
-
-                        float alphaWeightsSq[1] = { channelWeightsSq[3] };
-                        MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
-
-                        assert(shapeCollapsedEvalIndex >= 0);
-
-                        MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
-
-                        for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                        {
-                            if (isRGB)
-                            {
-                                temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
-                                tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
-                            }
-                            else
-                            {
-                                temps.unfinishedRGBA[rgbaInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
-                            }
-                        }
-
-                        ParallelMath::Int16CompFlag punchThroughInvalid[4];
-                        for (int pIter = 0; pIter < parityBitMax; pIter++)
-                        {
-                            punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
-
-                            if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
-                            {
-                                // Modes 6 and 7 have parity bits that affect alpha
-                                if (pIter == 0)
-                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
-                                else if (pIter == parityBitMax - 1)
-                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
-                                else
-                                    punchThroughInvalid[pIter] = isPunchThrough;
-                            }
-                        }
-
-                        for (int pIter = 0; pIter < parityBitMax; pIter++)
-                        {
-                            if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
-                                continue;
-
-                            bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                uint16_t p[2];
-                                p[0] = (pIter & 1);
-                                p[1] = ((pIter >> 1) & 1);
-
-                                MUInt15 ep[2][4];
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    for (int ch = 0; ch < 4; ch++)
-                                        ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
-
-                                for (int refine = 0; refine < numRefineRounds; refine++)
-                                {
-                                    switch (mode)
-                                    {
-                                    case 0:
-                                        CompressEndpoints0(ep, p, rtn);
-                                        break;
-                                    case 1:
-                                        CompressEndpoints1(ep, p[0], rtn);
-                                        break;
-                                    case 2:
-                                        CompressEndpoints2(ep, rtn);
-                                        break;
-                                    case 3:
-                                        CompressEndpoints3(ep, p, rtn);
-                                        break;
-                                    case 6:
-                                        CompressEndpoints6(ep, p, rtn);
-                                        break;
-                                    case 7:
-                                        CompressEndpoints7(ep, p, rtn);
-                                        break;
-                                    default:
-                                        assert(false);
-                                        break;
-                                    };
-
-                                    MFloat shapeError = ParallelMath::MakeFloatZero();
-
-                                    IndexSelector<4> indexSelector;
-                                    indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
-
-                                    EndpointRefiner<4> epRefiner;
-                                    epRefiner.Init(1 << indexPrec, channelWeights);
-
-                                    MUInt15 indexes[16];
-
-                                    AggregatedError<4> aggError;
-                                    for (int pxi = 0; pxi < shapeLength; pxi++)
-                                    {
-                                        int px = BC7Data::g_fragments[shapeStart + pxi];
-
-                                        MUInt15 index;
-                                        MUInt15 reconstructed[4];
-
-                                        index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
-                                        indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
-
-                                        if (flags & cvtt::Flags::BC7_FastIndexing)
-                                            BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
-                                        else
-                                        {
-                                            MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
-
-                                            MUInt15 altIndexes[2];
-                                            altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
-
-                                            for (int ii = 0; ii < 2; ii++)
-                                            {
-                                                indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
-
-                                                MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
-                                                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
-                                                error = ParallelMath::Min(error, altError);
-                                                ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
-                                            }
-
-                                            shapeError = shapeError + error;
-                                        }
-
-                                        if (refine != numRefineRounds - 1)
-                                            epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
-
-                                        indexes[pxi] = index;
-                                    }
-
-                                    if (flags & cvtt::Flags::BC7_FastIndexing)
-                                        shapeError = aggError.Finalize(flags, channelWeightsSq);
-
-                                    if (isRGB)
-                                        shapeError = shapeError + staticAlphaError;
-
-                                    ParallelMath::FloatCompFlag shapeErrorBetter;
-                                    ParallelMath::Int16CompFlag shapeErrorBetter16;
-
-                                    shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shapeCollapsedEvalIndex]);
-                                    shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
-
-                                    if (ParallelMath::AnySet(shapeErrorBetter16))
-                                    {
-                                        bool punchThroughOK = true;
-                                        if (needPunchThroughCheck)
-                                        {
-                                            shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
-                                            shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
-
-                                            if (!ParallelMath::AnySet(shapeErrorBetter16))
-                                                punchThroughOK = false;
-                                        }
-
-                                        if (punchThroughOK)
-                                        {
-                                            ParallelMath::ConditionalSet(temps.shapeBestError[shapeCollapsedEvalIndex], shapeErrorBetter, shapeError);
-                                            for (int epi = 0; epi < 2; epi++)
-                                                for (int ch = 0; ch < numRealChannels; ch++)
-                                                    ParallelMath::ConditionalSet(temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch], shapeErrorBetter16, ep[epi][ch]);
-
-                                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                                                ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
-                                        }
-                                    }
-
-                                    if (refine != numRefineRounds - 1)
-                                        epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
-                                } // refine
-                            } // tweak
-                        } // p
-
-                        if (flags & cvtt::Flags::BC7_TrySingleColor)
-                        {
-                            MUInt15 total[4];
-                            for (int ch = 0; ch < 4; ch++)
-                                total[ch] = ParallelMath::MakeUInt15(0);
-
-                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + pxi];
-                                for (int ch = 0; ch < 4; ch++)
-                                    total[ch] = total[ch] + pixels[pxi][ch];
-                            }
-
-                            MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
-                            MFloat average[4];
-                            for (int ch = 0; ch < 4; ch++)
-                                average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
-
-                            const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
-                            MFloat &shapeBestError = temps.shapeBestError[shapeCollapsedEvalIndex];
-                            MUInt15(&shapeBestEP)[2][4] = temps.shapeBestEP[shapeCollapsedEvalIndex];
-                            MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
-
-                            const cvtt::Tables::BC7SC::Table **scTables = NULL;
-                            int numSCTables = 0;
-
-                            switch (mode)
-                            {
-                            case 0:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i3,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 1:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i1,
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i2,
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i3,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i1,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i2,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i3,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 2:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode2,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 3:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode3_p0,
-                                        &cvtt::Tables::BC7SC::g_mode3_p1,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 6:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i1,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i2,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i3,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i4,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i5,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i6,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i7,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i1,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i2,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i3,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i4,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i5,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i6,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i7,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 7:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode7_p00,
-                                        &cvtt::Tables::BC7SC::g_mode7_p01,
-                                        &cvtt::Tables::BC7SC::g_mode7_p10,
-                                        &cvtt::Tables::BC7SC::g_mode7_p11,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            default:
-                                assert(false);
-                                break;
-                            }
-
-                            TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
-                        }
-                    } // shapeIter
-
-                    for (uint16_t partition = 0; partition < numPartitions; partition++)
-                    {
-                        const int *partitionShapes;
-                        if (numSubsets == 1)
-                            partitionShapes = BC7Data::g_shapes1[partition];
-                        else if (numSubsets == 2)
-                            partitionShapes = BC7Data::g_shapes2[partition];
-                        else
-                        {
-                            assert(numSubsets == 3);
-                            partitionShapes = BC7Data::g_shapes3[partition];
-                        }
-
-                        MFloat totalError = ParallelMath::MakeFloatZero();
-                        for (int subset = 0; subset < numSubsets; subset++)
-                            totalError = totalError + temps.shapeBestError[shapeCollapseList[partitionShapes[subset]]];
-
-                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
-                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                        if (ParallelMath::AnySet(errorBetter16))
-                        {
-                            for (int subset = 0; subset < numSubsets; subset++)
-                            {
-                                int shape = partitionShapes[subset];
-                                int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                                int shapeLength = BC7Data::g_shapeRanges[shape][1];
-                                int shapeCollapsedEvalIndex = shapeCollapseList[shape];
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    for (int ch = 0; ch < 4; ch++)
-                                        ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch]);
-
-                                for (int pxi = 0; pxi < shapeLength; pxi++)
-                                {
-                                    int px = BC7Data::g_fragments[shapeStart + pxi];
-                                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
-                                }
-                            }
-
-                            work.m_error = ParallelMath::Min(totalError, work.m_error);
-                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
-                            ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
-                        }
-                    }
-                }
-            }
-
-            // Searches the dual-plane encodings (modes 4 and 5) and records the best one in `work`.
-            //
-            // For each mode, each of the 4 channel rotations and each index-selector setting
-            // (two for mode 4, one for mode 5), the three "RGB" channels and the single rotated
-            // "alpha" channel are fitted independently over numTweakRounds x numRefineRounds
-            // attempts.  The best RGB error plus the best alpha error is then compared against
-            // work.m_error, and lanes where it is lower get their mode, rotation, index selector,
-            // indexes and endpoints overwritten.
-            //
-            //   flags           - returns immediately unless Flags::BC7_EnableDualPlane is set;
-            //                     Flags::BC7_FastIndexing skips the neighbouring-index search.
-            //   pixels          - 16 input pixels, 4 channels each.
-            //   floatPixels     - float counterpart of `pixels`, used for index selection and refinement.
-            //   channelWeights  - per-channel weights; their squares are used when finalizing errors.
-            //   numTweakRounds  - clamped to [1, MaxTweakRounds].
-            //   numRefineRounds - raised to at least 1.
-            //   work            - in/out best-so-far result.
-            //   rtn             - rounding scope forwarded to the helper routines.
-            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
-                // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
-                // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
-                // - Separate alpha channel, then weighted RGB
-                // - Alpha+2 other channels, then the independent channel
-
-                if (!(flags & Flags::BC7_EnableDualPlane))
-                    return;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                float channelWeightsSq[4];
-                for (int ch = 0; ch < 4; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                for (uint16_t mode = 4; mode <= 5; mode++)
-                {
-                    for (uint16_t rotation = 0; rotation < 4; rotation++)
-                    {
-                        // Rotation 0 keeps channel 3 as the separate plane; rotations 1..3 move
-                        // channel 0, 1 or 2 there and put channel 3 in its place.
-                        int alphaChannel = (rotation + 3) & 3;
-                        int redChannel = (rotation == 1) ? 3 : 0;
-                        int greenChannel = (rotation == 2) ? 3 : 1;
-                        int blueChannel = (rotation == 3) ? 3 : 2;
-
-                        MUInt15 rotatedRGB[16][3];
-                        MFloat floatRotatedRGB[16][3];
-
-                        for (int px = 0; px < 16; px++)
-                        {
-                            rotatedRGB[px][0] = pixels[px][redChannel];
-                            rotatedRGB[px][1] = pixels[px][greenChannel];
-                            rotatedRGB[px][2] = pixels[px][blueChannel];
-
-                            for (int ch = 0; ch < 3; ch++)
-                                floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
-                        }
-
-                        // Only mode 4 tries both index-selector settings.
-                        uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
-
-                        float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
-                        float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
-                        float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
-                        float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
-
-                        float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
-
-                        MFloat preWeightedRotatedRGB[16][3];
-                        BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
-
-                        for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
-                        {
-                            EndpointSelector<3, 8> rgbSelector;
-
-                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                            {
-                                for (int px = 0; px < 16; px++)
-                                    rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
-
-                                rgbSelector.FinishPass(epPass);
-                            }
-
-                            // Min/max of the separate channel over the block seed the alpha endpoints.
-                            MUInt15 alphaRange[2];
-
-                            alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
-                            for (int px = 1; px < 16; px++)
-                            {
-                                alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
-                                alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
-                            }
-
-                            // Index bit counts per plane: mode 4 splits 2/3 bits according to the
-                            // index selector, mode 5 uses 2 bits for both planes.
-                            int rgbPrec = 0;
-                            int alphaPrec = 0;
-
-                            if (mode == 4)
-                            {
-                                rgbPrec = indexSelector ? 3 : 2;
-                                alphaPrec = indexSelector ? 2 : 3;
-                            }
-                            else
-                                rgbPrec = alphaPrec = 2;
-
-                            UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
-
-                            MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
-                            MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
-
-                            MUInt15 bestRGBIndexes[16];
-                            MUInt15 bestAlphaIndexes[16];
-                            MUInt15 bestEP[2][4];
-
-                            for (int px = 0; px < 16; px++)
-                                bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                MUInt15 rgbEP[2][3];
-                                MUInt15 alphaEP[2];
-
-                                unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
-
-                                TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
-
-                                for (int refine = 0; refine < numRefineRounds; refine++)
-                                {
-                                    if (mode == 4)
-                                        CompressEndpoints4(rgbEP, alphaEP, rtn);
-                                    else
-                                        CompressEndpoints5(rgbEP, alphaEP, rtn);
-
-
-                                    IndexSelector<1> alphaIndexSelector;
-                                    IndexSelector<3> rgbIndexSelector;
-
-                                    {
-                                        MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
-                                        alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
-                                    }
-                                    rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
-
-                                    EndpointRefiner<3> rgbRefiner;
-                                    EndpointRefiner<1> alphaRefiner;
-
-                                    rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
-                                    alphaRefiner.Init(1 << alphaPrec, uniformWeight);
-
-                                    MFloat errorRGB = ParallelMath::MakeFloatZero();
-                                    MFloat errorA = ParallelMath::MakeFloatZero();
-
-                                    MUInt15 rgbIndexes[16];
-                                    MUInt15 alphaIndexes[16];
-
-                                    // Block-wide accumulators, only used on the fast-indexing path.
-                                    AggregatedError<3> rgbAggError;
-                                    AggregatedError<1> alphaAggError;
-
-                                    for (int px = 0; px < 16; px++)
-                                    {
-                                        MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
-                                        MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
-
-                                        MUInt15 reconstructedRGB[3];
-                                        MUInt15 reconstructedAlpha[1];
-
-                                        rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
-                                        alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
-
-                                        if (flags & cvtt::Flags::BC7_FastIndexing)
-                                        {
-                                            // Fast path: accept the selected index and just accumulate its error.
-                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
-                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
-                                        }
-                                        else
-                                        {
-                                            // Slow path: also evaluate the indexes one below and one above
-                                            // (clamped to the valid range) and keep whichever has the lowest error.
-                                            AggregatedError<3> baseRGBAggError;
-                                            AggregatedError<1> baseAlphaAggError;
-
-                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
-                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
-
-                                            MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                            MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-
-                                            MUInt15 altRGBIndexes[2];
-                                            MUInt15 altAlphaIndexes[2];
-
-                                            altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
-
-                                            altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
-
-                                            for (int ii = 0; ii < 2; ii++)
-                                            {
-                                                rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
-                                                alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
-
-                                                AggregatedError<3> altRGBAggError;
-                                                AggregatedError<1> altAlphaAggError;
-
-                                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
-                                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
-
-                                                MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                                MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-
-                                                ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
-                                                ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
-
-                                                rgbError = ParallelMath::Min(altRGBError, rgbError);
-                                                alphaError = ParallelMath::Min(altAlphaError, alphaError);
-
-                                                ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
-                                                ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
-                                            }
-
-                                            errorRGB = errorRGB + rgbError;
-                                            errorA = errorA + alphaError;
-                                        }
-
-                                        // The last refine round has no successor, so nothing is fed to the refiners.
-                                        if (refine != numRefineRounds - 1)
-                                        {
-                                            rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
-                                            alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
-                                        }
-
-                                        if (flags & Flags::BC7_FastIndexing)
-                                        {
-                                            // Each plane's error must come from its own accumulator: the alpha
-                                            // error is finalized from alphaAggError (one channel, one weight),
-                                            // not from the three-channel RGB accumulator.
-                                            // NOTE(review): only the value after the last pixel is consumed, so this
-                                            // could move below the pixel loop if Finalize has no side effects - confirm.
-                                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                            errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-                                        }
-
-                                        rgbIndexes[px] = rgbIndex;
-                                        alphaIndexes[px] = alphaIndex;
-                                    }
-
-                                    // Track the best RGB and the best alpha solution independently per lane.
-                                    ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
-                                    ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
-
-                                    ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
-                                    ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
-
-                                    if (ParallelMath::AnySet(rgbBetterInt16))
-                                    {
-                                        bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
-
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
-
-                                        for (int ep = 0; ep < 2; ep++)
-                                        {
-                                            for (int ch = 0; ch < 3; ch++)
-                                                ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
-                                        }
-                                    }
-
-                                    if (ParallelMath::AnySet(alphaBetterInt16))
-                                    {
-                                        bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
-
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
-
-                                        for (int ep = 0; ep < 2; ep++)
-                                            ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
-                                    }
-
-                                    // Produce refined endpoints for the next round.
-                                    if (refine != numRefineRounds - 1)
-                                    {
-                                        rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
-
-                                        MUInt15 alphaEPTemp[2][1];
-                                        alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
-
-                                        for (int i = 0; i < 2; i++)
-                                            alphaEP[i] = alphaEPTemp[i][0];
-                                    }
-                                }	// refine
-                            } // tweak
-
-                            MFloat combinedError = bestRGBError + bestAlphaError;
-
-                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
-                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                            work.m_error = ParallelMath::Min(combinedError, work.m_error);
-
-                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
-                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
-                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
-
-                            // With the index selector set, the alpha indexes go to m_indexes and the
-                            // RGB indexes to m_indexes2; otherwise the other way round.
-                            for (int px = 0; px < 16; px++)
-                            {
-                                ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
-                                ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
-                            }
-
-                            for (int ep = 0; ep < 2; ep++)
-                                for (int ch = 0; ch < 4; ch++)
-                                    ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
-                        }
-                    }
-                }
-            }
-
-            // Exchanges the values of `a` and `b` through one temporary copy.
-            // T must be copy-constructible and copy-assignable.
-            template<class T>
-            static void Swap(T& a, T& b)
-            {
-                T saved = b;
-                b = a;
-                a = saved;
-            }
-
-            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds)
-            {
-                MUInt15 pixels[16][4];
-                MFloat floatPixels[16][4];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
-                }
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
-                }
-
-                WorkInfo work;
-                memset(&work, 0, sizeof(work));
-
-                work.m_error = ParallelMath::MakeFloat(FLT_MAX);
-
-                {
-                    ParallelMath::RoundTowardNearestForScope rtn;
-                    TrySinglePlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
-                    TryDualPlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    PackingVector pv;
-                    pv.Init();
-
-                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
-                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
-                    ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
-
-                    const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
-
-                    ParallelMath::ScalarUInt16 indexes[16];
-                    ParallelMath::ScalarUInt16 indexes2[16];
-                    ParallelMath::ScalarUInt16 endPoints[3][2][4];
-
-                    for (int i = 0; i < 16; i++)
-                    {
-                        indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
-                        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                            indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
-                    }
-
-                    for (int subset = 0; subset < 3; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            for (int ch = 0; ch < 4; ch++)
-                                endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
-                        }
-                    }
-
-                    int fixups[3] = { 0, 0, 0 };
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
-                        bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
-
-                        if (flipRGB)
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                                indexes[px] = highIndex - indexes[px];
-                        }
-
-                        if (flipAlpha)
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                                indexes2[px] = highIndex - indexes2[px];
-                        }
-
-                        if (indexSelector)
-                            Swap(flipRGB, flipAlpha);
-
-                        if (flipRGB)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
-                        }
-                        if (flipAlpha)
-                            Swap(endPoints[0][0][3], endPoints[0][1][3]);
-
-                    }
-                    else
-                    {
-                        if (modeInfo.m_numSubsets == 2)
-                            fixups[1] = BC7Data::g_fixupIndexes2[partition];
-                        else if (modeInfo.m_numSubsets == 3)
-                        {
-                            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
-                            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
-                        }
-
-                        bool flip[3] = { false, false, false };
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                            flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
-
-                        if (flip[0] || flip[1] || flip[2])
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                            {
-                                int subset = 0;
-                                if (modeInfo.m_numSubsets == 2)
-                                    subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-                                else if (modeInfo.m_numSubsets == 3)
-                                    subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
-
-                                if (flip[subset])
-                                    indexes[px] = highIndex - indexes[px];
-                            }
-
-                            int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
-                            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                            {
-                                if (flip[subset])
-                                    for (int ch = 0; ch < maxCH; ch++)
-                                        Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
-                            }
-                        }
-                    }
-
-                    pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
-
-                    if (modeInfo.m_partitionBits)
-                        pv.Pack(partition, modeInfo.m_partitionBits);
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
-                        pv.Pack(rotation, 2);
-                    }
-
-                    if (modeInfo.m_hasIndexSelector)
-                        pv.Pack(indexSelector, 1);
-
-                    // Encode RGB
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
-                                epPart >>= (8 - modeInfo.m_rgbBits);
-
-                                pv.Pack(epPart, modeInfo.m_rgbBits);
-                            }
-                        }
-                    }
-
-                    // Encode alpha
-                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
-                                epPart >>= (8 - modeInfo.m_alphaBits);
-
-                                pv.Pack(epPart, modeInfo.m_alphaBits);
-                            }
-                        }
-                    }
-
-                    // Encode parity bits
-                    if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
-                            epPart >>= (7 - modeInfo.m_rgbBits);
-                            epPart &= 1;
-
-                            pv.Pack(epPart, 1);
-                        }
-                    }
-                    else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
-                                epPart >>= (7 - modeInfo.m_rgbBits);
-                                epPart &= 1;
-
-                                pv.Pack(epPart, 1);
-                            }
-                        }
-                    }
-
-                    // Encode indexes
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int bits = modeInfo.m_indexBits;
-                        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
-                            bits--;
-
-                        pv.Pack(indexes[px], bits);
-                    }
-
-                    // Encode secondary indexes
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        for (int px = 0; px < 16; px++)
-                        {
-                            int bits = modeInfo.m_alphaIndexBits;
-                            if (px == 0)
-                                bits--;
-
-                            pv.Pack(indexes2[px], bits);
-                        }
-                    }
-
-                    pv.Flush(packedBlocks);
-
-                    packedBlocks += 16;
-                }
-            }
-
-            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
-            {
-                UnpackingVector pv;
-                pv.Init(packedBlock);
-
-                int mode = 8;
-                for (int i = 0; i < 8; i++)
-                {
-                    if (pv.Unpack(1) == 1)
-                    {
-                        mode = i;
-                        break;
-                    }
-                }
-
-                if (mode > 7)
-                {
-                    for (int px = 0; px < 16; px++)
-                        for (int ch = 0; ch < 4; ch++)
-                            output.m_pixels[px][ch] = 0;
-
-                    return;
-                }
-
-                const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
-
-                int partition = 0;
-                if (modeInfo.m_partitionBits)
-                    partition = pv.Unpack(modeInfo.m_partitionBits);
-
-                int rotation = 0;
-                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    rotation = pv.Unpack(2);
-
-                int indexSelector = 0;
-                if (modeInfo.m_hasIndexSelector)
-                    indexSelector = pv.Unpack(1);
-
-                // Resolve fixups
-                int fixups[3] = { 0, 0, 0 };
-
-                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
-                {
-                    if (modeInfo.m_numSubsets == 2)
-                        fixups[1] = BC7Data::g_fixupIndexes2[partition];
-                    else if (modeInfo.m_numSubsets == 3)
-                    {
-                        fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
-                        fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
-                    }
-                }
-
-                int endPoints[3][2][4];
-
-                // Decode RGB
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
-                    }
-                }
-
-                // Decode alpha
-                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
-                    }
-                }
-                else
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][3] = 255;
-                    }
-                }
-
-                int parityBits = 0;
-
-                // Decode parity bits
-                if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        int p = pv.Unpack(1);
-
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
-
-                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
-                        }
-                    }
-
-                    parityBits = 1;
-                }
-                else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            int p = pv.Unpack(1);
-
-                            for (int ch = 0; ch < 3; ch++)
-                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
-
-                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
-                        }
-                    }
-
-                    parityBits = 1;
-                }
-
-                // Fill endpoint bits
-                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                {
-                    for (int ep = 0; ep < 2; ep++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
-
-                        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                            endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
-                    }
-                }
-
-                int indexes[16];
-                int indexes2[16];
-
-                // Decode indexes
-                for (int px = 0; px < 16; px++)
-                {
-                    int bits = modeInfo.m_indexBits;
-                    if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
-                        bits--;
-
-                    indexes[px] = pv.Unpack(bits);
-                }
-
-                // Decode secondary indexes
-                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int bits = modeInfo.m_alphaIndexBits;
-                        if (px == 0)
-                            bits--;
-
-                        indexes2[px] = pv.Unpack(bits);
-                    }
-                }
-                else
-                {
-                    for (int px = 0; px < 16; px++)
-                        indexes2[px] = 0;
-                }
-
-                const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
-                const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
-
-                // Decode each pixel
-                for (int px = 0; px < 16; px++)
-                {
-                    int rgbWeight = 0;
-                    int alphaWeight = 0;
-
-                    int rgbIndex = indexes[px];
-
-                    rgbWeight = rgbWeights[indexes[px]];
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
-                        alphaWeight = rgbWeight;
-                    else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                        alphaWeight = alphaWeights[indexes2[px]];
-
-                    if (indexSelector == 1)
-                    {
-                        int temp = rgbWeight;
-                        rgbWeight = alphaWeight;
-                        alphaWeight = temp;
-                    }
-
-                    int pixel[4] = { 0, 0, 0, 255 };
-
-                    int subset = 0;
-
-                    if (modeInfo.m_numSubsets == 2)
-                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-                    else if (modeInfo.m_numSubsets == 3)
-                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
-
-                    for (int ch = 0; ch < 3; ch++)
-                        pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
-
-                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                        pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
-
-                    if (rotation != 0)
-                    {
-                        int ch = rotation - 1;
-                        int temp = pixel[ch];
-                        pixel[ch] = pixel[3];
-                        pixel[3] = temp;
-                    }
-
-                    for (int ch = 0; ch < 4; ch++)
-                        output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
-                }
-            }
-        };
-
-        class BC6HComputer
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::UInt31 MUInt31;
-
-            static const int MaxTweakRounds = 4;
-            static const int MaxRefineRounds = 3;
-
-            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
-            {
-                assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
-                assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
-
-                // Expand to full range
-                ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
-                MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
-
-                absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
-
-                MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
-
-                return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
-            }
-
-            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
-            {
-                MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
-                return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
-            }
-
-            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
-            {
-                MSInt16 zero = ParallelMath::MakeSInt16(0);
-
-                ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
-                MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
-
-                MSInt16 unq;
-                MUInt15 absUnq;
-
-                if (precision >= 16)
-                {
-                    unq = comp;
-                    absUnq = absComp;
-                }
-                else
-                {
-                    MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
-                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
-                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
-
-                    absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
-                    ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
-                    ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
-
-                    unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
-                }
-
-                outUnquantized = unq;
-
-                MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
-
-                outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
-            }
-
-            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
-            {
-                MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
-                if (precision < 15)
-                {
-                    MUInt15 zero = ParallelMath::MakeUInt15(0);
-                    MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
-
-                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
-                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
-
-                    unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
-
-                    ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
-                    ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
-                }
-
-                outUnquantized = unq;
-                outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
-            }
-
-            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MSInt16 unquantizedEP[2][3];
-                MSInt16 finishedUnquantizedEP[2][3];
-
-                {
-                    ParallelMath::RoundUpForScope ru;
-
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
-                            UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
-                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
-                        }
-                    }
-                }
-
-                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
-                indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
-
-                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
-
-                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
-
-                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
-
-                if (ParallelMath::AnySet(invert))
-                {
-                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
-
-                    indexSelector.ConditionalInvert(invert);
-
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MAInt16 firstEP = quantizedEndPoints[0][ch];
-                        MAInt16 secondEP = quantizedEndPoints[1][ch];
-
-                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
-                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
-                    }
-                }
-
-                indexes[fixupIndex] = index;
-            }
-
-            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MUInt16 unquantizedEP[2][3];
-                MUInt16 finishedUnquantizedEP[2][3];
-
-                {
-                    ParallelMath::RoundUpForScope ru;
-
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
-                            UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
-                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
-                        }
-                    }
-                }
-
-                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
-                indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
-
-                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
-
-                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
-
-                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
-
-                if (ParallelMath::AnySet(invert))
-                {
-                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
-
-                    indexSelector.ConditionalInvert(invert);
-
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MAInt16 firstEP = quantizedEndPoints[0][ch];
-                        MAInt16 secondEP = quantizedEndPoints[1][ch];
-
-                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
-                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
-                    }
-                }
-
-                indexes[fixupIndex] = index;
-            }
-
-            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
-            {
-                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
-
-                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
-
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    outEncodedEPs[0][0][ch] = ep0[0][ch];
-                    outEncodedEPs[0][1][ch] = ep0[1][ch];
-                    outEncodedEPs[1][0][ch] = ep1[0][ch];
-                    outEncodedEPs[1][1][ch] = ep1[1][ch];
-
-                    if (isTransformed)
-                    {
-                        for (int subset = 0; subset < 2; subset++)
-                        {
-                            for (int epi = 0; epi < 2; epi++)
-                            {
-                                if (epi == 0 && subset == 0)
-                                    continue;
-
-                                MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
-
-                                MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
-
-                                outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
-
-                                MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
-                                allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
-                            }
-                        }
-                    }
-
-                    if (!ParallelMath::AnySet(allLegal))
-                        break;
-                }
-
-                outIsLegal = allLegal;
-            }
-
-            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
-            {
-                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
-
-                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
-
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    outEncodedEPs[0][ch] = ep[0][ch];
-                    outEncodedEPs[1][ch] = ep[1][ch];
-
-                    if (isTransformed)
-                    {
-                        MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
-
-                        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
-
-                        outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
-
-                        MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
-                        allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
-                    }
-                }
-
-                outIsLegal = allLegal;
-            }
-
-            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
-            {
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-                else if (numRefineRounds > MaxRefineRounds)
-                    numRefineRounds = MaxRefineRounds;
-
-                bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
-                float channelWeightsSq[3];
-
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                MSInt16 pixels[16][3];
-                MFloat floatPixels2CL[16][3];
-                MFloat floatPixelsLinearWeighted[16][3];
-
-                MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MSInt16 pixelValue;
-                        ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
-
-                        // Convert from sign+magnitude to 2CL
-                        if (isSigned)
-                        {
-                            ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
-                            MSInt16 magnitude = (pixelValue & low15Bits);
-                            ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
-                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
-                        }
-                        else
-                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
-
-                        pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
-
-                        pixels[px][ch] = pixelValue;
-                        floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
-                        floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
-                    }
-                }
-
-                MFloat preWeightedPixels[16][3];
-
-                BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
-
-                MAInt16 bestEndPoints[2][2][3];
-                MUInt15 bestIndexes[16];
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-                MUInt15 bestMode = ParallelMath::MakeUInt15(0);
-                MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
-
-                for (int px = 0; px < 16; px++)
-                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                for (int subset = 0; subset < 2; subset++)
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
-
-                UnfinishedEndpoints<3> partitionedUFEP[32][2];
-                UnfinishedEndpoints<3> singleUFEP;
-
-                // Generate UFEP for partitions
-                for (int p = 0; p < 32; p++)
-                {
-                    int partitionMask = BC7Data::g_partitionMap[p];
-
-                    EndpointSelector<3, 8> epSelectors[2];
-
-                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                    {
-                        for (int px = 0; px < 16; px++)
-                        {
-                            int subset = (partitionMask >> px) & 1;
-                            epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
-                        }
-
-                        for (int subset = 0; subset < 2; subset++)
-                            epSelectors[subset].FinishPass(pass);
-                    }
-
-                    for (int subset = 0; subset < 2; subset++)
-                        partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
-                }
-
-                // Generate UFEP for single
-                {
-                    EndpointSelector<3, 8> epSelector;
-
-                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                    {
-                        for (int px = 0; px < 16; px++)
-                            epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
-
-                        epSelector.FinishPass(pass);
-                    }
-
-                    singleUFEP = epSelector.GetEndpoints(channelWeights);
-                }
-
-                for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
-                {
-                    bool partitioned = (partitionedInt == 1);
-
-                    for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
-                    {
-                        if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
-                            continue;
-
-                        int numPartitions = partitioned ? 32 : 1;
-                        int numSubsets = partitioned ? 2 : 1;
-                        int indexBits = partitioned ? 3 : 4;
-                        int indexRange = (1 << indexBits);
-
-                        for (int p = 0; p < numPartitions; p++)
-                        {
-                            int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
-
-                            const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
-
-                            MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
-                            MUInt15 metaIndexes[MaxMetaRounds][16];
-                            MFloat metaError[MaxMetaRounds][2];
-
-                            bool roundValid[MaxMetaRounds][2];
-
-                            for (int r = 0; r < MaxMetaRounds; r++)
-                                for (int subset = 0; subset < 2; subset++)
-                                    roundValid[r][subset] = true;
-
-                            for (int subset = 0; subset < numSubsets; subset++)
-                            {
-                                for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
-                                {
-                                    EndpointRefiner<3> refiners[2];
-
-                                    bool abortRemainingRefines = false;
-                                    for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
-                                    {
-                                        int metaRound = tweak * MaxRefineRounds + refinePass;
-
-                                        if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
-                                            abortRemainingRefines = true;
-
-                                        if (abortRemainingRefines)
-                                        {
-                                            roundValid[metaRound][subset] = false;
-                                            continue;
-                                        }
-
-                                        MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
-                                        MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
-
-                                        MSInt16 endPointsColorSpace[2][3];
-
-                                        if (refinePass == 0)
-                                        {
-                                            UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
-
-                                            if (isSigned)
-                                                ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
-                                            else
-                                                ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
-                                        }
-                                        else
-                                            refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
-
-                                        refiners[subset].Init(indexRange, channelWeights);
-
-                                        int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
-
-                                        IndexSelectorHDR<3> indexSelector;
-                                        if (isSigned)
-                                            QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
-                                        else
-                                            QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
-
-                                        if (metaRound > 0)
-                                        {
-                                            ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
-
-                                            for (int prevRound = 0; prevRound < metaRound; prevRound++)
-                                            {
-                                                MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
-
-                                                ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
-
-                                                for (int epi = 0; epi < 2; epi++)
-                                                    for (int ch = 0; ch < 3; ch++)
-                                                        same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
-
-                                                anySame = (anySame | same);
-                                                if (ParallelMath::AllSet(anySame))
-                                                    break;
-                                            }
-
-                                            if (ParallelMath::AllSet(anySame))
-                                            {
-                                                roundValid[metaRound][subset] = false;
-                                                continue;
-                                            }
-                                        }
-
-                                        MFloat subsetError = ParallelMath::MakeFloatZero();
-
-                                        {
-                                            for (int px = 0; px < 16; px++)
-                                            {
-                                                if (subset != ((partitionMask >> px) & 1))
-                                                    continue;
-
-                                                MUInt15 index;
-                                                if (px == fixupIndex)
-                                                    index = mrIndexes[px];
-                                                else
-                                                {
-                                                    index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
-                                                    mrIndexes[px] = index;
-                                                }
-
-                                                MSInt16 reconstructed[3];
-                                                if (isSigned)
-                                                    indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
-                                                else
-                                                    indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
-
-                                                subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
-
-                                                if (refinePass != numRefineRounds - 1)
-                                                    refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
-                                            }
-                                        }
-
-                                        metaError[metaRound][subset] = subsetError;
-                                    }
-                                }
-                            }
-
-                            // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
-                            int numMeta1 = partitioned ? MaxMetaRounds : 1;
-                            for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
-                            {
-                                if (!roundValid[meta0][0])
-                                    continue;
-
-                                for (int meta1 = 0; meta1 < numMeta1; meta1++)
-                                {
-                                    MFloat combinedError = metaError[meta0][0];
-                                    if (partitioned)
-                                    {
-                                        if (!roundValid[meta1][1])
-                                            continue;
-
-                                        combinedError = combinedError + metaError[meta1][1];
-                                    }
-
-                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
-                                    if (!ParallelMath::AnySet(errorBetter))
-                                        continue;
-
-                                    ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                                    // Figure out if this is encodable
-                                    for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
-                                    {
-                                        const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
-
-                                        if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
-                                            continue;
-
-                                        MAInt16 encodedEPs[2][2][3];
-                                        ParallelMath::Int16CompFlag isLegal;
-                                        if (partitioned)
-                                            EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
-                                        else
-                                            EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
-
-                                        ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
-                                        if (!ParallelMath::AnySet(isLegalAndBetter))
-                                            continue;
-
-                                        ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
-
-                                        ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
-                                        ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
-                                        ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
-
-                                        for (int subset = 0; subset < numSubsets; subset++)
-                                        {
-                                            for (int epi = 0; epi < 2; epi++)
-                                            {
-                                                for (int ch = 0; ch < 3; ch++)
-                                                    ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
-                                            }
-                                        }
-
-                                        for (int px = 0; px < 16; px++)
-                                        {
-                                            int subset = ((partitionMask >> px) & 1);
-                                            if (subset == 0)
-                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
-                                            else
-                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
-                                        }
-
-                                        needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
-                                        if (!ParallelMath::AnySet(needsCommit))
-                                            break;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-
-                // At this point, everything should be set
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
-                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
-                    int32_t eps[2][2][3];
-                    ParallelMath::ScalarUInt16 indexes[16];
-
-                    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
-
-                    const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
-
-                    const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
-
-                    for (int subset = 0; subset < 2; subset++)
-                    {
-                        for (int epi = 0; epi < 2; epi++)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
-                        }
-                    }
-
-                    for (int px = 0; px < 16; px++)
-                        indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
-
-                    uint16_t modeID = modeInfo.m_modeID;
-
-                    PackingVector pv;
-                    pv.Init();
-
-                    for (size_t i = 0; i < headerBits; i++)
-                    {
-                        int32_t codedValue = 0;
-                        switch (desc[i].m_eField)
-                        {
-                        case BC6HData::M:  codedValue = modeID; break;
-                        case BC6HData::D:  codedValue = partition; break;
-                        case BC6HData::RW: codedValue = eps[0][0][0]; break;
-                        case BC6HData::RX: codedValue = eps[0][1][0]; break;
-                        case BC6HData::RY: codedValue = eps[1][0][0]; break;
-                        case BC6HData::RZ: codedValue = eps[1][1][0]; break;
-                        case BC6HData::GW: codedValue = eps[0][0][1]; break;
-                        case BC6HData::GX: codedValue = eps[0][1][1]; break;
-                        case BC6HData::GY: codedValue = eps[1][0][1]; break;
-                        case BC6HData::GZ: codedValue = eps[1][1][1]; break;
-                        case BC6HData::BW: codedValue = eps[0][0][2]; break;
-                        case BC6HData::BX: codedValue = eps[0][1][2]; break;
-                        case BC6HData::BY: codedValue = eps[1][0][2]; break;
-                        case BC6HData::BZ: codedValue = eps[1][1][2]; break;
-                        default: assert(false); break;
-                        }
-
-                        pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
-                    }
-
-                    int fixupIndex1 = 0;
-                    int indexBits = 4;
-                    if (modeInfo.m_partitioned)
-                    {
-                        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
-                        indexBits = 3;
-                    }
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
-                        if (px == 0 || px == fixupIndex1)
-                            pv.Pack(index, indexBits - 1);
-                        else
-                            pv.Pack(index, indexBits);
-                    }
-
-                    pv.Flush(packedBlocks + 16 * block);
-                }
-            }
-
-            static void SignExtendSingle(int &v, int bits)
-            {
-                if (v & (1 << (bits - 1)))
-                    v |= -(1 << bits);
-            }
-
-            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
-            {
-                UnpackingVector pv;
-                pv.Init(pBC);
-
-                int numModeBits = 2;
-                int modeBits = pv.Unpack(2);
-                if (modeBits != 0 && modeBits != 1)
-                {
-                    modeBits |= pv.Unpack(3) << 2;
-                    numModeBits += 3;
-                }
-
-                int mode = -1;
-                for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
-                {
-                    if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
-                    {
-                        mode = possibleMode;
-                        break;
-                    }
-                }
-
-                if (mode < 0)
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            output.m_pixels[px][ch] = 0;
-                        output.m_pixels[px][3] = 0x3c00;	// 1.0
-                    }
-                    return;
-                }
-
-                const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
-                const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
-                const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
-
-                int32_t partition = 0;
-                int32_t eps[2][2][3];
-
-                for (int subset = 0; subset < 2; subset++)
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            eps[subset][epi][ch] = 0;
-
-                for (size_t i = numModeBits; i < headerBits; i++)
-                {
-                    int32_t *pCodedValue = NULL;
-
-                    switch (desc[i].m_eField)
-                    {
-                    case BC6HData::D:  pCodedValue = &partition; break;
-                    case BC6HData::RW: pCodedValue = &eps[0][0][0]; break;
-                    case BC6HData::RX: pCodedValue = &eps[0][1][0]; break;
-                    case BC6HData::RY: pCodedValue = &eps[1][0][0]; break;
-                    case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break;
-                    case BC6HData::GW: pCodedValue = &eps[0][0][1]; break;
-                    case BC6HData::GX: pCodedValue = &eps[0][1][1]; break;
-                    case BC6HData::GY: pCodedValue = &eps[1][0][1]; break;
-                    case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break;
-                    case BC6HData::BW: pCodedValue = &eps[0][0][2]; break;
-                    case BC6HData::BX: pCodedValue = &eps[0][1][2]; break;
-                    case BC6HData::BY: pCodedValue = &eps[1][0][2]; break;
-                    case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break;
-                    default: assert(false); break;
-                    }
-
-                    (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
-                }
-
-
-                uint16_t modeID = modeInfo.m_modeID;
-
-                int fixupIndex1 = 0;
-                int indexBits = 4;
-                int numSubsets = 1;
-                if (modeInfo.m_partitioned)
-                {
-                    fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
-                    indexBits = 3;
-                    numSubsets = 2;
-                }
-
-                int indexes[16];
-                for (int px = 0; px < 16; px++)
-                {
-                    if (px == 0 || px == fixupIndex1)
-                        indexes[px] = pv.Unpack(indexBits - 1);
-                    else
-                        indexes[px] = pv.Unpack(indexBits);
-                }
-
-                if (modeInfo.m_partitioned)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        if (isSigned)
-                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
-                        if (modeInfo.m_transformed || isSigned)
-                        {
-                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
-                            SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
-                            SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
-                        }
-                    }
-                }
-                else
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        if (isSigned)
-                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
-                        if (modeInfo.m_transformed || isSigned)
-                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
-                    }
-                }
-
-                int aPrec = modeInfo.m_aPrec;
-
-                if (modeInfo.m_transformed)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        int wrapMask = (1 << aPrec) - 1;
-
-                        eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
-                        if (isSigned)
-                            SignExtendSingle(eps[0][1][ch], aPrec);
-
-                        if (modeInfo.m_partitioned)
-                        {
-                            eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
-                            eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
-
-                            if (isSigned)
-                            {
-                                SignExtendSingle(eps[1][0][ch], aPrec);
-                                SignExtendSingle(eps[1][1][ch], aPrec);
-                            }
-                        }
-                    }
-                }
-
-                // Unquantize endpoints
-                for (int subset = 0; subset < numSubsets; subset++)
-                {
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            int &v = eps[subset][epi][ch];
-
-                            if (isSigned)
-                            {
-                                if (aPrec >= 16)
-                                {
-                                    // Nothing
-                                }
-                                else
-                                {
-                                    bool s = false;
-                                    int comp = v;
-                                    if (v < 0)
-                                    {
-                                        s = true;
-                                        comp = -comp;
-                                    }
-
-                                    int unq = 0;
-                                    if (comp == 0)
-                                        unq = 0;
-                                    else if (comp >= ((1 << (aPrec - 1)) - 1))
-                                        unq = 0x7fff;
-                                    else
-                                        unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
-
-                                    if (s)
-                                        unq = -unq;
-
-                                    v = unq;
-                                }
-                            }
-                            else
-                            {
-                                if (aPrec >= 15)
-                                {
-                                    // Nothing
-                                }
-                                else if (v == 0)
-                                {
-                                    // Nothing
-                                }
-                                else if (v == ((1 << aPrec) - 1))
-                                    v = 0xffff;
-                                else
-                                    v = ((v << 16) + 0x8000) >> aPrec;
-                            }
-                        }
-                    }
-                }
-
-                const int *weights = BC7Data::g_weightTables[indexBits];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    int subset = 0;
-                    if (modeInfo.m_partitioned)
-                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-
-                    int w = weights[indexes[px]];
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
-
-                        if (isSigned)
-                        {
-                            if (comp < 0)
-                                comp = -(((-comp) * 31) >> 5);
-                            else
-                                comp = (comp * 31) >> 5;
-
-                            int s = 0;
-                            if (comp < 0)
-                            {
-                                s = 0x8000;
-                                comp = -comp;
-                            }
-
-                            output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
-                        }
-                        else
-                        {
-                            comp = (comp * 31) >> 6;
-                            output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
-                        }
-                    }
-                    output.m_pixels[px][3] = 0x3c00;	// 1.0
-                }
-            }
-        };
-
-        namespace S3TCSingleColorTables
-        {
-            struct SingleColorTableEntry
-            {
-                uint8_t m_min;
-                uint8_t m_max;
-                uint8_t m_actualColor;
-                uint8_t m_span;
-            };
-
-            SingleColorTableEntry g_singleColor5_3[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
-                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
-                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
-                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
-                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
-                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
-                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
-                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
-                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
-                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
-                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
-                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
-                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
-                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
-                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
-                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
-                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
-                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
-                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
-                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_3[256] =
-            {
-                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 69, 0, 23, 69 },
-                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 65, 8, 27, 57 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 69, 12, 31, 57 },
-                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 65, 20, 35, 45 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 69, 24, 39, 45 },
-                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
-                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
-                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
-                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
-                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
-                { 56, 93, 80, 37 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 60, 97, 84, 37 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
-                { 56, 105, 88, 49 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 60, 109, 92, 49 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
-                { 134, 77, 96, 57 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 130, 85, 100, 45 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
-                { 134, 89, 104, 45 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
-                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
-                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
-                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
-                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
-                { 142, 146, 144, 4 }, { 121, 158, 145, 37 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 125, 162, 149, 37 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
-                { 150, 154, 152, 4 }, { 121, 170, 153, 49 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 125, 174, 157, 49 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
-                { 158, 162, 160, 4 }, { 199, 142, 161, 57 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 195, 150, 165, 45 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
-                { 166, 170, 168, 4 }, { 199, 154, 169, 45 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
-                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
-                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
-                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
-                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
-                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 186, 223, 210, 37 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 190, 227, 214, 37 }, { 215, 215, 215, 0 },
-                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 186, 235, 218, 49 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 190, 239, 222, 49 }, { 223, 223, 223, 0 },
-                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 186, 247, 226, 61 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 190, 251, 230, 61 }, { 231, 231, 231, 0 },
-                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_2[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
-                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
-                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
-                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
-                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
-                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
-                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
-                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
-                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
-                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
-                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
-                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
-                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
-                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
-                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
-                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
-                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
-                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
-                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
-                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_2[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
-                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
-                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
-                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
-                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
-                { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
-                { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
-                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
-                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
-                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
-                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
-                { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
-                { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
-                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
-                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
-                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
-                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
-                { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
-                { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
-                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_3_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
-                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
-                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
-                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
-                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
-                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
-                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
-                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
-                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
-                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
-                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
-                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
-                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
-                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
-                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
-                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
-                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
-                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
-                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
-                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_3_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
-                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
-                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
-                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
-                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
-                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
-                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
-                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
-                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
-                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
-                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
-                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
-                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
-                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
-                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
-                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
-                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
-                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
-                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
-                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_2_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
-                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
-                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
-                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
-                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
-                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
-                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
-                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
-                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
-                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
-                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
-                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
-                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
-                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
-                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
-                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
-                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
-                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
-                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
-                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_2_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
-                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
-                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
-                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
-                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
-                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
-                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
-                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
-                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
-                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
-                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
-                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
-                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
-                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
-                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
-                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
-                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
-                { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
-                { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
-                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-        }
-
-        class S3TCComputer
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            static void Init(MFloat& error)
-            {
-                error = ParallelMath::MakeFloat(FLT_MAX);
-            }
-
-            static void QuantizeTo6Bits(MUInt15& v)
-            {
-                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));
-                v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);
-            }
-
-            static void QuantizeTo5Bits(MUInt15& v)
-            {
-                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));
-                v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);
-            }
-
-            static void QuantizeTo565(MUInt15 endPoint[3])
-            {
-                QuantizeTo5Bits(endPoint[0]);
-                QuantizeTo6Bits(endPoint[1]);
-                QuantizeTo5Bits(endPoint[2]);
-            }
-
-            static MFloat ParanoidFactorForSpan(const MSInt16& span)
-            {
-                return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;
-            }
-
-            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
-            {
-                MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
-                absDiff = absDiff + d;
-                return absDiff * absDiff;
-            }
-
-            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
-                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                float channelWeightsSq[3];
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                        totals[ch] = totals[ch] + pixels[px][ch];
-                }
-
-                MUInt15 average[3];
-                for (int ch = 0; ch < 3; ch++)
-                    average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
-
-                const S3TCSingleColorTables::SingleColorTableEntry* rbTable = NULL;
-                const S3TCSingleColorTables::SingleColorTableEntry* gTable = NULL;
-                if (flags & cvtt::Flags::S3TC_Paranoid)
-                {
-                    if (range == 4)
-                    {
-                        rbTable = S3TCSingleColorTables::g_singleColor5_3_p;
-                        gTable = S3TCSingleColorTables::g_singleColor6_3_p;
-                    }
-                    else
-                    {
-                        assert(range == 3);
-                        rbTable = S3TCSingleColorTables::g_singleColor5_2_p;
-                        gTable = S3TCSingleColorTables::g_singleColor6_2_p;
-                    }
-                }
-                else
-                {
-                    if (range == 4)
-                    {
-                        rbTable = S3TCSingleColorTables::g_singleColor5_3;
-                        gTable = S3TCSingleColorTables::g_singleColor6_3;
-                    }
-                    else
-                    {
-                        assert(range == 3);
-                        rbTable = S3TCSingleColorTables::g_singleColor5_2;
-                        gTable = S3TCSingleColorTables::g_singleColor6_2;
-                    }
-                }
-
-                MUInt15 interpolated[3];
-                MUInt15 eps[2][3];
-                MSInt16 spans[3];
-                for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        uint16_t avg = ParallelMath::Extract(average[ch], i);
-                        const S3TCSingleColorTables::SingleColorTableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
-                        ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
-                        ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
-                        ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
-                        ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
-                    }
-                }
-
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & cvtt::Flags::S3TC_Paranoid)
-                {
-                    MFloat spanParanoidFactors[3];
-                    for (int ch = 0; ch < 3; ch++)
-                        spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
-                    }
-                }
-                else
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
-                    }
-                }
-
-                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
-                ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
-
-                if (ParallelMath::AnySet(better16))
-                {
-                    bestError = ParallelMath::Min(bestError, error);
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
-
-                    MUInt15 vindexes = ParallelMath::MakeUInt15(1);
-                    for (int px = 0; px < 16; px++)
-                        ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
-
-                    ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
-                }
-            }
-
-            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
-                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                float channelWeightsSq[3];
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                MUInt15 endPoints[2][3];
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < 3; ch++)
-                        endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
-
-                QuantizeTo565(endPoints[0]);
-                QuantizeTo565(endPoints[1]);
-
-                IndexSelector<3> selector;
-                selector.Init<false>(channelWeights, endPoints, range);
-
-                MUInt15 indexes[16];
-
-                MFloat paranoidFactors[3];
-                for (int ch = 0; ch < 3; ch++)
-                    paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
-
-                MFloat error = ParallelMath::MakeFloatZero();
-                AggregatedError<3> aggError;
-                for (int px = 0; px < 16; px++)
-                {
-                    MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
-                    indexes[px] = index;
-
-                    if (refiner)
-                        refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
-
-                    MUInt15 reconstructed[3];
-                    selector.ReconstructLDRPrecise(index, reconstructed);
-
-                    if (flags & Flags::S3TC_Paranoid)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
-                    }
-                    else
-                        BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
-                }
-
-                if (!(flags & Flags::S3TC_Paranoid))
-                    error = aggError.Finalize(flags, channelWeightsSq);
-
-                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
-
-                if (ParallelMath::AnySet(better))
-                {
-                    ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
-
-                    ParallelMath::ConditionalSet(bestError, better, error);
-
-                    for (int ep = 0; ep < 2; ep++)
-                        for (int ch = 0; ch < 3; ch++)
-                            ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
-
-                    for (int px = 0; px < 16; px++)
-                        ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
-
-                    ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
-                }
-            }
-
-            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
-                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
-                const ParallelMath::RoundTowardNearestForScope* rtn)
-            {
-                UNREFERENCED_PARAMETER(alphaTest);
-                UNREFERENCED_PARAMETER(flags);
-
-                EndpointRefiner<3> refiner;
-
-                refiner.Init(nCounts, channelWeights);
-
-                bool escape = false;
-                int e = 0;
-                for (int i = 0; i < nCounts; i++)
-                {
-                    for (int n = 0; n < counts[i]; n++)
-                    {
-                        ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
-                        if (!ParallelMath::AnySet(valid))
-                        {
-                            escape = true;
-                            break;
-                        }
-
-                        if (ParallelMath::AllSet(valid))
-                            refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
-                        else
-                        {
-                            MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
-                            refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
-                        }
-                    }
-
-                    if (escape)
-                        break;
-                }
-
-                MUInt15 endPoints[2][3];
-                refiner.GetRefinedEndpointsLDR(endPoints, rtn);
-
-                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
-            }
-
-            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
-            {
-                UNREFERENCED_PARAMETER(flags);
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                float weights[1] = { 1.0f };
-
-                MUInt15 pixels[16];
-                MFloat floatPixels[16];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
-                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
-                }
-
-                MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
-
-                IndexSelector<1> selector;
-                selector.Init<false>(weights, ep, 16);
-
-                MUInt15 indexes[16];
-
-                for (int px = 0; px < 16; px++)
-                    indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    for (int px = 0; px < 16; px += 8)
-                    {
-                        int index0 = ParallelMath::Extract(indexes[px], block);
-                        int index1 = ParallelMath::Extract(indexes[px], block);
-
-                        packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
-                    }
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-
-            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
-            {
-                if (maxTweakRounds < 1)
-                    maxTweakRounds = 1;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                float oneWeight[1] = { 1.0f };
-
-                MUInt15 pixels[16];
-                MFloat floatPixels[16];
-
-                MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
-                MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
-
-                for (int px = 0; px < 16; px++)
-                {
-                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
-
-                    if (isSigned)
-                        pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
-
-                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
-                }
-
-                MUInt15 sortedPixels[16];
-                for (int px = 0; px < 16; px++)
-                    sortedPixels[px] = pixels[px];
-
-                for (int sortEnd = 15; sortEnd > 0; sortEnd--)
-                {
-                    for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
-                    {
-                        MUInt15 a = sortedPixels[sortOffset];
-                        MUInt15 b = sortedPixels[sortOffset + 1];
-
-                        sortedPixels[sortOffset] = ParallelMath::Min(a, b);
-                        sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
-                    }
-                }
-
-                MUInt15 zero = ParallelMath::MakeUInt15(0);
-                MUInt15 one = ParallelMath::MakeUInt15(1);
-
-                MUInt15 bestIsFullRange = zero;
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-                MUInt15 bestEP[2] = { zero, zero };
-                MUInt15 bestIndexes[16] = {
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero
-                };
-
-                // Full-precision
-                {
-                    MUInt15 minEP = sortedPixels[0];
-                    MUInt15 maxEP = sortedPixels[15];
-
-                    MFloat base[1] = { ParallelMath::ToFloat(minEP) };
-                    MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
-
-                    UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
-
-                    int numTweakRounds = BCCommon::TweakRoundsForRange(8);
-                    if (numTweakRounds > maxTweakRounds)
-                        numTweakRounds = maxTweakRounds;
-
-                    for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                    {
-                        MUInt15 ep[2][1];
-
-                        ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
-
-                        for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
-                        {
-                            EndpointRefiner<1> refiner;
-                            refiner.Init(8, oneWeight);
-
-                            if (isSigned)
-                                for (int epi = 0; epi < 2; epi++)
-                                    ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
-
-                            IndexSelector<1> indexSelector;
-                            indexSelector.Init<false>(oneWeight, ep, 8);
-
-                            MUInt15 indexes[16];
-
-                            AggregatedError<1> aggError;
-                            for (int px = 0; px < 16; px++)
-                            {
-                                MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                                MUInt15 reconstructedPixel;
-
-                                indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
-                                BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
-
-                                if (refinePass != numRefineRounds - 1)
-                                    refiner.ContributeUnweightedPW(&floatPixels[px], index);
-
-                                indexes[px] = index;
-                            }
-                            MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
-
-                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                            if (ParallelMath::AnySet(errorBetter16))
-                            {
-                                bestError = ParallelMath::Min(error, bestError);
-                                ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
-                                for (int px = 0; px < 16; px++)
-                                    ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
-                            }
-
-                            if (refinePass != numRefineRounds - 1)
-                                refiner.GetRefinedEndpointsLDR(ep, &rtn);
-                        }
-                    }
-                }
-
-                // Reduced precision with special endpoints
-                {
-                    MUInt15 bestHeuristicMin = sortedPixels[0];
-                    MUInt15 bestHeuristicMax = sortedPixels[15];
-
-                    ParallelMath::Int16CompFlag canTryClipping;
-
-                    // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
-                    // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
-                    // This will usually not find anything, but it's cheap to check.
-
-                    {
-                        MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
-                        MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
-
-                        MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
-                        canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
-                    }
-
-                    if (ParallelMath::AnySet(canTryClipping))
-                    {
-                        MUInt15 lowClearances[16];
-                        MUInt15 highClearances[16];
-                        MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
-
-                        lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
-
-                        for (int px = 1; px < 16; px++)
-                        {
-                            lowClearances[px] = sortedPixels[px - 1];
-                            highClearances[px] = highTerminal - sortedPixels[16 - px];
-                        }
-
-                        for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
-                        {
-                            uint16_t numSkippedLow = firstIndex;
-
-                            MUInt15 lowClearance = lowClearances[firstIndex];
-
-                            for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
-                            {
-                                uint16_t numSkippedHigh = 15 - lastIndex;
-                                uint16_t numSkipped = numSkippedLow + numSkippedHigh;
-
-                                MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
-
-                                ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
-
-                                if (!ParallelMath::AnySet(areMoreSkipped))
-                                    continue;
-
-                                MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
-                                MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
-
-                                MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
-
-                                ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
-                                ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
-                                ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
-                            }
-                        }
-                    }
-
-                    MUInt15 bestSimpleMin = one;
-                    MUInt15 bestSimpleMax = highTerminalMinusOne;
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
-                        ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
-                    }
-
-                    MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
-                    MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
-
-                    int minEPRange = 2;
-                    if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
-                        minEPRange = 1;
-
-                    int maxEPRange = 2;
-                    if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
-                        maxEPRange = 1;
-
-                    for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
-                    {
-                        for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
-                        {
-                            MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
-                            MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
-
-                            UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
-
-                            int numTweakRounds = BCCommon::TweakRoundsForRange(6);
-                            if (numTweakRounds > maxTweakRounds)
-                                numTweakRounds = maxTweakRounds;
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                MUInt15 ep[2][1];
-
-                                ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
-
-                                for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
-                                {
-                                    EndpointRefiner<1> refiner;
-                                    refiner.Init(6, oneWeight);
-
-                                    if (isSigned)
-                                        for (int epi = 0; epi < 2; epi++)
-                                            ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
-
-                                    IndexSelector<1> indexSelector;
-                                    indexSelector.Init<false>(oneWeight, ep, 6);
-
-                                    MUInt15 indexes[16];
-                                    MFloat error = ParallelMath::MakeFloatZero();
-
-                                    for (int px = 0; px < 16; px++)
-                                    {
-                                        MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                                        MUInt15 reconstructedPixel;
-
-                                        indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
-
-                                        MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
-                                        MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
-                                        MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
-
-                                        MFloat bestPixelError = zeroError;
-                                        MUInt15 index = ParallelMath::MakeUInt15(6);
-
-                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
-                                        bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
-
-                                        ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
-
-                                        if (ParallelMath::AllSet(selectedIndexBetter))
-                                        {
-                                            if (refinePass != numRefineRounds - 1)
-                                                refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
-                                        }
-                                        else
-                                        {
-                                            MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
-
-                                            if (refinePass != numRefineRounds - 1)
-                                                refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
-                                        }
-
-                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
-                                        bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
-
-                                        error = error + bestPixelError;
-
-                                        indexes[px] = index;
-                                    }
-
-                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                                    ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                                    if (ParallelMath::AnySet(errorBetter16))
-                                    {
-                                        bestError = ParallelMath::Min(error, bestError);
-                                        ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
-
-                                        for (int epi = 0; epi < 2; epi++)
-                                            ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
-                                    }
-
-                                    if (refinePass != numRefineRounds - 1)
-                                        refiner.GetRefinedEndpointsLDR(ep, &rtn);
-                                }
-                            }
-                        }
-                    }
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    int ep0 = ParallelMath::Extract(bestEP[0], block);
-                    int ep1 = ParallelMath::Extract(bestEP[1], block);
-                    int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
-
-                    if (isSigned)
-                    {
-                        ep0 -= 127;
-                        ep1 -= 127;
-
-                        assert(ep0 >= -127 && ep0 <= 127);
-                        assert(ep1 >= -127 && ep1 <= 127);
-                    }
-
-
-                    bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
-
-                    if (swapEndpoints)
-                        std::swap(ep0, ep1);
-
-                    uint16_t dumpBits = 0;
-                    int dumpBitsOffset = 0;
-                    int dumpByteOffset = 2;
-                    packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
-                    packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
-
-                    int maxValue = (isFullRange != 0) ? 7 : 5;
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int index = ParallelMath::Extract(bestIndexes[px], block);
-
-                        if (swapEndpoints && index <= maxValue)
-                            index = maxValue - index;
-
-                        if (index != 0)
-                        {
-                            if (index == maxValue)
-                                index = 1;
-                            else if (index < maxValue)
-                                index++;
-                        }
-
-                        assert(index >= 0 && index < 8);
-
-                        dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
-                        dumpBitsOffset += 3;
-
-                        if (dumpBitsOffset >= 8)
-                        {
-                            assert(dumpByteOffset < 8);
-                            packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
-                            dumpBits >>= 8;
-                            dumpBitsOffset -= 8;
-                            dumpByteOffset++;
-                        }
-                    }
-
-                    assert(dumpBitsOffset == 0);
-                    assert(dumpByteOffset == 8);
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-
-            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
-            {
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (maxTweakRounds < 1)
-                    maxTweakRounds = 1;
-
-                EndpointSelector<3, 8> endpointSelector;
-
-                MUInt15 pixels[16][4];
-                MFloat floatPixels[16][4];
-
-                MFloat preWeightedPixels[16][4];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
-                }
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
-                }
-
-                if (alphaTest)
-                {
-                    MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
-                        pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
-                    }
-                }
-
-                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
-
-                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
-
-                for (int px = 0; px < 16; px++)
-                    minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
-
-                MFloat pixelWeights[16];
-                for (int px = 0; px < 16; px++)
-                {
-                    pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
-                    if (alphaTest)
-                    {
-                        ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
-
-                        ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
-                    }
-                }
-
-                for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                {
-                    for (int px = 0; px < 16; px++)
-                        endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
-
-                    endpointSelector.FinishPass(pass);
-                }
-
-                UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
-
-                MUInt15 bestEndpoints[2][3];
-                MUInt15 bestIndexes[16];
-                MUInt15 bestRange = ParallelMath::MakeUInt15(0);
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-
-                for (int px = 0; px < 16; px++)
-                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < 3; ch++)
-                        bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
-
-                if (exhaustive)
-                {
-                    MSInt16 sortBins[16];
-
-                    {
-                        // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
-                        // and pack the original indexes into the low bits.
-
-                        MUInt15 sortEP[2][3];
-                        ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
-
-                        IndexSelector<3> sortSelector;
-                        sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
-
-                        for (int16_t px = 0; px < 16; px++)
-                        {
-                            MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
-
-                            if (alphaTest)
-                            {
-                                ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
-
-                                ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
-                            }
-
-                            sortBin = sortBin + ParallelMath::MakeSInt16(px);
-
-                            sortBins[px] = sortBin;
-                        }
-                    }
-
-                    // Sort bins
-                    for (int sortEnd = 1; sortEnd < 16; sortEnd++)
-                    {
-                        for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
-                        {
-                            MSInt16 a = sortBins[sortLoc];
-                            MSInt16 b = sortBins[sortLoc - 1];
-
-                            sortBins[sortLoc] = ParallelMath::Max(a, b);
-                            sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
-                        }
-                    }
-
-                    MUInt15 firstElement = ParallelMath::MakeUInt15(0);
-                    for (uint16_t e = 0; e < 16; e++)
-                    {
-                        ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
-                        ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
-                        if (!ParallelMath::AnySet(isInvalid))
-                            break;
-                    }
-
-                    MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
-
-                    MUInt15 sortedInputs[16][4];
-                    MFloat floatSortedInputs[16][4];
-                    MFloat pwFloatSortedInputs[16][4];
-
-                    for (int e = 0; e < 16; e++)
-                    {
-                        for (int ch = 0; ch < 4; ch++)
-                            sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
-                    }
-
-                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                    {
-                        for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
-                        {
-                            ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
-                            int originalIndex = (sortBin & 15);
-
-                            for (int ch = 0; ch < 4; ch++)
-                                ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
-                        }
-                    }
-
-                    for (int e = 0; e < 16; e++)
-                    {
-                        for (int ch = 0; ch < 4; ch++)
-                        {
-                            MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
-                            floatSortedInputs[e][ch] = f;
-                            pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
-                        }
-                    }
-
-                    for (int n0 = 0; n0 <= 15; n0++)
-                    {
-                        int remainingFor1 = 16 - n0;
-                        if (remainingFor1 == 16)
-                            remainingFor1 = 15;
-
-                        for (int n1 = 0; n1 <= remainingFor1; n1++)
-                        {
-                            int remainingFor2 = 16 - n1 - n0;
-                            if (remainingFor2 == 16)
-                                remainingFor2 = 15;
-
-                            for (int n2 = 0; n2 <= remainingFor2; n2++)
-                            {
-                                int n3 = 16 - n2 - n1 - n0;
-
-                                if (n3 == 16)
-                                    continue;
-
-                                int counts[4] = { n0, n1, n2, n3 };
-
-                                TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                            }
-                        }
-                    }
-
-                    TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-
-                    if (alphaTest)
-                    {
-                        for (int n0 = 0; n0 <= 15; n0++)
-                        {
-                            int remainingFor1 = 16 - n0;
-                            if (remainingFor1 == 16)
-                                remainingFor1 = 15;
-
-                            for (int n1 = 0; n1 <= remainingFor1; n1++)
-                            {
-                                int n2 = 16 - n1 - n0;
-
-                                if (n2 == 16)
-                                    continue;
-
-                                int counts[3] = { n0, n1, n2 };
-
-                                TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                            }
-                        }
-
-                        TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                    }
-                }
-                else
-                {
-                    int minRange = alphaTest ? 3 : 4;
-
-                    for (int range = minRange; range <= 4; range++)
-                    {
-                        int tweakRounds = BCCommon::TweakRoundsForRange(range);
-                        if (tweakRounds > maxTweakRounds)
-                            tweakRounds = maxTweakRounds;
-
-                        for (int tweak = 0; tweak < tweakRounds; tweak++)
-                        {
-                            MUInt15 endPoints[2][3];
-
-                            ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
-
-                            for (int refine = 0; refine < numRefineRounds; refine++)
-                            {
-                                EndpointRefiner<3> refiner;
-                                refiner.Init(range, channelWeights);
-
-                                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
-
-                                if (refine != numRefineRounds - 1)
-                                    refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
-                            }
-                        }
-                    }
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
-                    assert(range == 3 || range == 4);
-
-                    ParallelMath::ScalarUInt16 compressedEP[2];
-                    for (int ep = 0; ep < 2; ep++)
-                    {
-                        ParallelMath::ScalarUInt16 endPoint[3];
-                        for (int ch = 0; ch < 3; ch++)
-                            endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
-
-                        int compressed = (endPoint[0] & 0xf8) << 8;
-                        compressed |= (endPoint[1] & 0xfc) << 3;
-                        compressed |= (endPoint[2] & 0xf8) >> 3;
-
-                        compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
-                    }
-
-                    int indexOrder[4];
-
-                    if (range == 4)
-                    {
-                        if (compressedEP[0] == compressedEP[1])
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 0;
-                            indexOrder[2] = 0;
-                            indexOrder[3] = 0;
-                        }
-                        else if (compressedEP[0] < compressedEP[1])
-                        {
-                            std::swap(compressedEP[0], compressedEP[1]);
-                            indexOrder[0] = 1;
-                            indexOrder[1] = 3;
-                            indexOrder[2] = 2;
-                            indexOrder[3] = 0;
-                        }
-                        else
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 3;
-                            indexOrder[3] = 1;
-                        }
-                    }
-                    else
-                    {
-                        assert(range == 3);
-
-                        if (compressedEP[0] > compressedEP[1])
-                        {
-                            std::swap(compressedEP[0], compressedEP[1]);
-                            indexOrder[0] = 1;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 0;
-                        }
-                        else
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 1;
-                        }
-                        indexOrder[3] = 3;
-                    }
-
-                    packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
-                    packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
-                    packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
-                    packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
-
-                    for (int i = 0; i < 16; i += 4)
-                    {
-                        int packedIndexes = 0;
-                        for (int subi = 0; subi < 4; subi++)
-                        {
-                            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
-                            packedIndexes |= (indexOrder[index] << (subi * 2));
-                        }
-
-                        packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
-                    }
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-        };
-
-        // Signed input blocks are converted into unsigned space, with the maximum value being 254
-        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
-        {
-            for (size_t block = 0; block < ParallelMath::ParallelSize; block++)
-            {
-                const PixelBlockS8& inputSignedBlock = inputSigned[block];
-                PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
-
-                for (size_t px = 0; px < 16; px++)
-                {
-                    for (size_t ch = 0; ch < 4; ch++)
-                        inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127);
-                }
-            }
-        }
-
-        void FillWeights(const Options &options, float channelWeights[4])
-        {
-            if (options.flags & Flags::Uniform)
-                channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
-            else
-            {
-                channelWeights[0] = options.redWeight;
-                channelWeights[1] = options.greenWeight;
-                channelWeights[2] = options.blueWeight;
-                channelWeights[3] = options.alphaWeight;
-            }
-        }
-    }
-
-    namespace Kernels
-    {
-        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, options.seedPoints, options.refineRoundsBC7);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
-                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
-
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
-                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
-
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
-                pBC += 16;
-            }
-        }
-
-        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false);
-                pBC += 16;
-            }
-        }
-
-        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true);
-                pBC += 16;
-            }
-        }
-    }
-}
diff --git a/thirdparty/cvtt/ConvectionKernels.h b/thirdparty/cvtt/ConvectionKernels.h
index fb5ca130f9..3da48405ff 100644
--- a/thirdparty/cvtt/ConvectionKernels.h
+++ b/thirdparty/cvtt/ConvectionKernels.h
@@ -25,21 +25,13 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __CVTT_CONVECTION_KERNELS__
 #define __CVTT_CONVECTION_KERNELS__
 
+#include <stddef.h>
 #include <stdint.h>
 
 namespace cvtt
 {
     namespace Flags
     {
-        // Enable partitioned modes in BC7 encoding (slower, better quality)
-        const uint32_t BC7_EnablePartitioning   = 0x001;
-
-        // Enable 3-partition modes in BC7 encoding (slower, better quality, requires BC7_EnablePartitioning)
-        const uint32_t BC7_Enable3Subsets       = 0x002;
-
-        // Enable dual-plane modes in BC7 encoding (slower, better quality)
-        const uint32_t BC7_EnableDualPlane      = 0x004;
-
         // Use fast indexing in BC7 encoding (about 2x faster, slightly worse quality)
         const uint32_t BC7_FastIndexing         = 0x008;
 
@@ -61,13 +53,19 @@ namespace cvtt
         // Uniform color channel importance
         const uint32_t Uniform                  = 0x200;
 
+        // Use fake BT.709 color space for etc2comp compatibility (slower)
+        const uint32_t ETC_UseFakeBT709         = 0x400;
+
+        // Use accurate quantization functions when quantizing fake BT.709 (much slower, marginal improvement on specific blocks)
+        const uint32_t ETC_FakeBT709Accurate    = 0x800;
+
         // Misc useful default flag combinations
-        const uint32_t Fastest = (BC6H_FastIndexing | S3TC_Paranoid);
-        const uint32_t Faster = (BC7_EnableDualPlane | BC6H_FastIndexing | S3TC_Paranoid);
-        const uint32_t Fast = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_FastIndexing | S3TC_Paranoid);
-        const uint32_t Default = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | BC7_FastIndexing | S3TC_Paranoid);
-        const uint32_t Better = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | S3TC_Paranoid | S3TC_Exhaustive);
-        const uint32_t Ultra = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | BC7_TrySingleColor | S3TC_Paranoid | S3TC_Exhaustive);
+        const uint32_t Fastest = (BC6H_FastIndexing | BC7_FastIndexing | S3TC_Paranoid); // NOTE(review): same bits as Faster
+        const uint32_t Faster = (BC6H_FastIndexing | BC7_FastIndexing | S3TC_Paranoid); // NOTE(review): same bits as Fastest
+        const uint32_t Fast = (BC7_FastIndexing | S3TC_Paranoid); // NOTE(review): same bits as Default
+        const uint32_t Default = (BC7_FastIndexing | S3TC_Paranoid); // NOTE(review): same bits as Fast
+        const uint32_t Better = (S3TC_Paranoid | S3TC_Exhaustive);
+        const uint32_t Ultra = (BC7_TrySingleColor | S3TC_Paranoid | S3TC_Exhaustive | ETC_FakeBT709Accurate); // NOTE(review): sets ETC_FakeBT709Accurate without ETC_UseFakeBT709 - confirm it has an effect on its own
     }
 
     const unsigned int NumParallelBlocks = 8;
@@ -81,7 +79,7 @@ namespace cvtt
         float blueWeight;       // Blue channel importance
         float alphaWeight;      // Alpha channel importance
 
-        int refineRoundsBC7;    // Number of refine rounds for BC7
+        int refineRoundsBC7;   // Number of refine rounds for BC7
         int refineRoundsBC6H;   // Number of refine rounds for BC6H (max 3)
         int refineRoundsIIC;    // Number of refine rounds for independent interpolated channels (BC3 alpha, BC4, BC5)
         int refineRoundsS3TC;   // Number of refine rounds for S3TC RGB
@@ -104,6 +102,102 @@ namespace cvtt
         }
     };
 
+    struct BC7FineTuningParams // Input of Kernels::ConfigureBC7EncodingPlanFromFineTuningParams
+    {
+        // Seed point counts for each mode+configuration combination
+        uint8_t mode0SP[16]; // NOTE(review): array extents presumably match each mode's configuration count - confirm
+        uint8_t mode1SP[64];
+        uint8_t mode2SP[64];
+        uint8_t mode3SP[64];
+        uint8_t mode4SP[4][2];
+        uint8_t mode5SP[4];
+        uint8_t mode6SP;
+        uint8_t mode7SP[64];
+
+        BC7FineTuningParams() // Initializes every seed point count above to 4
+        {
+            for (int i = 0; i < 16; i++)
+                this->mode0SP[i] = 4;
+
+            for (int i = 0; i < 64; i++) // modes 1, 2, 3 and 7 all have 64 entries, so they share one loop
+            {
+                this->mode1SP[i] = 4;
+                this->mode2SP[i] = 4;
+                this->mode3SP[i] = 4;
+                this->mode7SP[i] = 4;
+            }
+
+            for (int i = 0; i < 4; i++) // modes 4 and 5 share the outer extent of 4
+            {
+                for (int j = 0; j < 2; j++)
+                    this->mode4SP[i][j] = 4;
+
+                this->mode5SP[i] = 4;
+            }
+
+            this->mode6SP = 4;
+        }
+    };
+
+    struct BC7EncodingPlan // Passed to Kernels::EncodeBC7; filled by the Kernels::ConfigureBC7EncodingPlanFrom* functions
+    {
+        static const int kNumRGBAShapes = 129; // both shape counts are below 256, so uint8_t list entries and counts can hold them
+        static const int kNumRGBShapes = 243;
+
+        uint64_t mode1PartitionEnabled; // NOTE(review): the *PartitionEnabled fields are presumably one bit per partition - confirm
+        uint64_t mode2PartitionEnabled;
+        uint64_t mode3PartitionEnabled;
+        uint16_t mode0PartitionEnabled; // 16-bit, unlike the 64-bit masks of the other modes
+        uint64_t mode7RGBAPartitionEnabled;
+        uint64_t mode7RGBPartitionEnabled;
+        uint8_t mode4SP[4][2];
+        uint8_t mode5SP[4];
+        bool mode6Enabled;
+
+        uint8_t seedPointsForShapeRGB[kNumRGBShapes];
+        uint8_t seedPointsForShapeRGBA[kNumRGBAShapes];
+
+        uint8_t rgbaShapeList[kNumRGBAShapes];
+        uint8_t rgbaNumShapesToEvaluate;
+
+        uint8_t rgbShapeList[kNumRGBShapes];
+        uint8_t rgbNumShapesToEvaluate;
+
+        BC7EncodingPlan() // Default plan: every shape listed in index order, everything enabled, all seed point counts 4
+        {
+            for (int i = 0; i < kNumRGBShapes; i++)
+            {
+                this->rgbShapeList[i] = i; // identity order
+                this->seedPointsForShapeRGB[i] = 4;
+            }
+            this->rgbNumShapesToEvaluate = kNumRGBShapes;
+
+            for (int i = 0; i < kNumRGBAShapes; i++)
+            {
+                this->rgbaShapeList[i] = i; // identity order
+                this->seedPointsForShapeRGBA[i] = 4;
+            }
+            this->rgbaNumShapesToEvaluate = kNumRGBAShapes;
+
+            // Set every bit of each partition mask and turn mode 6 on.
+            this->mode0PartitionEnabled = 0xffff;
+            this->mode1PartitionEnabled = 0xffffffffffffffffULL;
+            this->mode2PartitionEnabled = 0xffffffffffffffffULL;
+            this->mode3PartitionEnabled = 0xffffffffffffffffULL;
+            this->mode6Enabled = true;
+            this->mode7RGBPartitionEnabled = 0xffffffffffffffffULL;
+            this->mode7RGBAPartitionEnabled = 0xffffffffffffffffULL;
+
+            for (int i = 0; i < 4; i++)
+            {
+                for (int j = 0; j < 2; j++)
+                    this->mode4SP[i][j] = 4;
+
+                this->mode5SP[i] = 4;
+            }
+        }
+    };
+
     // RGBA input block for unsigned 8-bit formats
     struct PixelBlockU8
     {
@@ -116,14 +210,34 @@ namespace cvtt
         int8_t m_pixels[16][4];
     };
 
+    struct PixelBlockScalarS16 // Input block type of Kernels::EncodeETC2Alpha11
+    {
+        int16_t m_pixels[16]; // one signed 16-bit value per entry, 16 entries (no per-pixel channel dimension)
+    };
+
     // RGBA input block for half-precision float formats (bit-cast to int16_t)
     struct PixelBlockF16
     {
         int16_t m_pixels[16][4];
     };
 
+    class ETC2CompressionData // Handle type returned by Kernels::AllocETC2Data and released by Kernels::ReleaseETC2Data
+    {
+    protected:
+        ETC2CompressionData() {} // protected: cannot be constructed directly by outside code
+    };
+
+    class ETC1CompressionData // Handle type returned by Kernels::AllocETC1Data and released by Kernels::ReleaseETC1Data
+    {
+    protected:
+        ETC1CompressionData() {} // protected: cannot be constructed directly by outside code
+    };
+
     namespace Kernels
     {
+        typedef void* allocFunc_t(void *context, size_t size); // caller-supplied allocation callback used by AllocETC1Data / AllocETC2Data
+        typedef void freeFunc_t(void *context, void* ptr, size_t size); // caller-supplied release callback used by ReleaseETC1Data / ReleaseETC2Data
+
         // NOTE: All functions accept and output NumParallelBlocks blocks at once
         void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
         void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
@@ -134,7 +248,28 @@ namespace cvtt
         void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options);
         void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options);
         void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options);
-        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
+        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, const BC7EncodingPlan &encodingPlan); // encodingPlan: see ConfigureBC7EncodingPlanFrom* below
+        void EncodeETC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, ETC1CompressionData *compressionData); // compressionData: from AllocETC1Data
+        void EncodeETC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, ETC2CompressionData *compressionData); // compressionData: from AllocETC2Data
+        void EncodeETC2RGBA(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData); // writes 16 bytes per block
+        void EncodeETC2PunchthroughAlpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData);
+
+        void EncodeETC2Alpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options); // takes no compression data argument
+        void EncodeETC2Alpha11(uint8_t *pBC, const PixelBlockScalarS16 *pBlocks, bool isSigned, const cvtt::Options &options); // takes no compression data argument
+
+        // Generates a BC7 encoding plan from a quality parameter that ranges from 1 (fastest) to 100 (best)
+        void ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality);
+
+        // Generates a BC7 encoding plan from fine-tuning parameters. NOTE(review): the meaning of the bool result is not visible here - confirm.
+        bool ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params);
+
+        // ETC compression requires temporary storage that normally consumes a large amount of stack space.
+        // To allocate and release it, use one of these functions.
+        ETC2CompressionData *AllocETC2Data(allocFunc_t allocFunc, void *context, const cvtt::Options &options);
+        void ReleaseETC2Data(ETC2CompressionData *compressionData, freeFunc_t freeFunc); // NOTE(review): no context argument here - presumably retained from allocation; confirm
+
+        ETC1CompressionData *AllocETC1Data(allocFunc_t allocFunc, void *context);
+        void ReleaseETC1Data(ETC1CompressionData *compressionData, freeFunc_t freeFunc); // NOTE(review): no context argument here - presumably retained from allocation; confirm
 
         void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC);
         void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC);
diff --git a/thirdparty/cvtt/ConvectionKernels_API.cpp b/thirdparty/cvtt/ConvectionKernels_API.cpp
new file mode 100644
index 0000000000..707e71d474
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_API.cpp
@@ -0,0 +1,346 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include <stdint.h>
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_Util.h"
+#include "ConvectionKernels_BC67.h"
+#include "ConvectionKernels_ETC.h"
+#include "ConvectionKernels_S3TC.h"
+
+#include <assert.h>
+
+namespace cvtt
+{
+    namespace Kernels
+    {
+        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, const BC7EncodingPlan &encodingPlan)
+        { // Encodes one batch of NumParallelBlocks input blocks as BC7 into pBC, following encodingPlan.
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, encodingPlan, options.refineRoundsBC7);
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
+        { // Encodes one batch of NumParallelBlocks half-float blocks as BC6H into pBC; differs from EncodeBC6HS only in the bool passed to Pack.
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H); // false: presumably "not signed" - confirm
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
+        { // Encodes one batch of NumParallelBlocks half-float blocks as BC6H into pBC; differs from EncodeBC6HU only in the bool passed to Pack.
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H); // true: presumably "signed" - confirm
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
+        { // Encodes one batch of NumParallelBlocks input blocks as BC1 into pBC (8 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC); // only encoder here that passes true and options.threshold; BC2/BC3 pass false and 1.0f
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        { // Encodes one batch of NumParallelBlocks input blocks as BC2 into pBC (16 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC); // RGB part is written at offset 8 with a 16-byte stride
+                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16); // explicit alpha from channel 3 is written at offset 0 with a 16-byte stride
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        { // Encodes one batch of NumParallelBlocks input blocks as BC3 into pBC (16 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC); // RGB part is written at offset 8 with a 16-byte stride
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC); // interpolated alpha from channel 3 is written at offset 0 with a 16-byte stride
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        { // Encodes channel 0 of one batch of NumParallelBlocks input blocks as BC4 into pBC (8 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC); // false here vs true in EncodeBC4S
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
+        { // Signed-input counterpart of EncodeBC4U: encodes channel 0 of one batch of blocks into pBC (8 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
+                Util::BiasSignedInput(inputBlocks, pBlocks + blockBase); // produces unsigned blocks from the signed input, since the packer takes PixelBlockU8
+
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC); // true here vs false in EncodeBC4U
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        { // Encodes channels 0 and 1 of one batch of NumParallelBlocks input blocks as BC5 into pBC (16 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC); // channel 0 at offset 0, 16-byte stride
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC); // channel 1 at offset 8, 16-byte stride
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
+        { // Signed-input counterpart of EncodeBC5U: encodes channels 0 and 1 of one batch of blocks into pBC (16 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
+                Util::BiasSignedInput(inputBlocks, pBlocks + blockBase); // produces unsigned blocks from the signed input, since the packer takes PixelBlockU8
+
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC); // channel 0 at offset 0, 16-byte stride
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC); // channel 1 at offset 8, 16-byte stride
+                pBC += ParallelMath::ParallelSize * 16; // output advances 16 bytes per block
+            }
+        }
+
+        void EncodeETC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC1CompressionData *compressionData)
+        { // Encodes one batch of NumParallelBlocks input blocks as ETC1 into pBC (8 bytes per block), using compressionData as working storage.
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::ETCComputer::CompressETC1Block(pBC, pBlocks + blockBase, compressionData, options); // NOTE(review): compressionData is not null-checked here
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeETC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        { // Encodes one batch of NumParallelBlocks input blocks as ETC2 into pBC (8 bytes per block), using compressionData as working storage.
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::ETCComputer::CompressETC2Block(pBC, pBlocks + blockBase, compressionData, options, false); // false here vs true in EncodeETC2PunchthroughAlpha
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeETC2PunchthroughAlpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        { // Same loop as EncodeETC2, but passes true as the last CompressETC2Block argument (8 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights); // NOTE(review): channelWeights is not read again in this function
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::ETCComputer::CompressETC2Block(pBC, pBlocks + blockBase, compressionData, options, true); // true: presumably selects punchthrough alpha - confirm
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeETC2Alpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
+        { // Encodes one batch of NumParallelBlocks input blocks with CompressETC2AlphaBlock into pBC (8 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::ETCComputer::CompressETC2AlphaBlock(pBC, pBlocks + blockBase, options);
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeETC2Alpha11(uint8_t *pBC, const PixelBlockScalarS16 *pBlocks, bool isSigned, const cvtt::Options &options)
+        { // Encodes one batch of NumParallelBlocks scalar blocks with CompressEACBlock into pBC (8 bytes per block).
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) // ParallelSize blocks per iteration
+            {
+                Internal::ETCComputer::CompressEACBlock(pBC, pBlocks + blockBase, isSigned, options); // isSigned is forwarded unchanged
+                pBC += ParallelMath::ParallelSize * 8; // output advances 8 bytes per block
+            }
+        }
+
+        void EncodeETC2RGBA(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        { // Encodes color and alpha separately, then interleaves them into 16-byte output blocks. NOTE(review): pBC is not asserted here, unlike the other encoders.
+            uint8_t alphaBlockData[cvtt::NumParallelBlocks * 8]; // temporary output of EncodeETC2Alpha, 8 bytes per block
+            uint8_t colorBlockData[cvtt::NumParallelBlocks * 8]; // temporary output of EncodeETC2, 8 bytes per block
+
+            EncodeETC2(colorBlockData, pBlocks, options, compressionData); // asserts pBlocks internally
+            EncodeETC2Alpha(alphaBlockData, pBlocks, options);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) // one output block per iteration
+            {
+                for (size_t blockData = 0; blockData < 8; blockData++)
+                    pBC[blockBase * 16 + blockData] = alphaBlockData[blockBase * 8 + blockData]; // bytes 0-7 of each output block: alpha
+
+                for (size_t blockData = 0; blockData < 8; blockData++)
+                    pBC[blockBase * 16 + 8 + blockData] = colorBlockData[blockBase * 8 + blockData]; // bytes 8-15 of each output block: color
+            }
+        }
+
+        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
+        { // Decodes NumParallelBlocks consecutive 16-byte BC7 blocks from pBC into pBlocks.
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) // one block per iteration, unlike the encoders
+            {
+                Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
+                pBC += 16; // input advances 16 bytes per block
+            }
+        }
+
+        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
+        { // Decodes NumParallelBlocks consecutive 16-byte BC6H blocks from pBC into pBlocks; differs from DecodeBC6HS only in the bool passed to UnpackOne.
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) // one block per iteration, unlike the encoders
+            {
+                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false); // false: presumably "not signed" - confirm
+                pBC += 16; // input advances 16 bytes per block
+            }
+        }
+
+        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
+        { // Decodes NumParallelBlocks consecutive 16-byte BC6H blocks from pBC into pBlocks; differs from DecodeBC6HU only in the bool passed to UnpackOne.
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) // one block per iteration, unlike the encoders
+            {
+                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true); // true: presumably "signed" - confirm
+                pBC += 16; // input advances 16 bytes per block
+            }
+        }
+
+        ETC1CompressionData *AllocETC1Data(allocFunc_t allocFunc, void *context)
+        { // Public entry point; forwards unchanged to Internal::ETCComputer::AllocETC1Data.
+            return cvtt::Internal::ETCComputer::AllocETC1Data(allocFunc, context);
+        }
+
+        void ReleaseETC1Data(ETC1CompressionData *compressionData, freeFunc_t freeFunc)
+        { // Public entry point; forwards unchanged to Internal::ETCComputer::ReleaseETC1Data.
+            cvtt::Internal::ETCComputer::ReleaseETC1Data(compressionData, freeFunc);
+        }
+
+        ETC2CompressionData *AllocETC2Data(allocFunc_t allocFunc, void *context, const cvtt::Options &options)
+        { // Public entry point; forwards unchanged to Internal::ETCComputer::AllocETC2Data.
+            return cvtt::Internal::ETCComputer::AllocETC2Data(allocFunc, context, options);
+        }
+
+        void ReleaseETC2Data(ETC2CompressionData *compressionData, freeFunc_t freeFunc)
+        { // Public entry point; forwards unchanged to Internal::ETCComputer::ReleaseETC2Data.
+            cvtt::Internal::ETCComputer::ReleaseETC2Data(compressionData, freeFunc);
+        }
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_AggregatedError.h b/thirdparty/cvtt/ConvectionKernels_AggregatedError.h
new file mode 100644
index 0000000000..9f9356a345
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_AggregatedError.h
@@ -0,0 +1,55 @@
+#pragma once
+#ifndef __CVTT_AGGREGATEDERROR_H__
+#define __CVTT_AGGREGATEDERROR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize> // TVectorSize: number of channels tracked
+        class AggregatedError // Accumulates unweighted per-channel error and combines it into one float total in Finalize
+        {
+        public:
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt31 MUInt31;
+            typedef ParallelMath::Float MFloat;
+
+            AggregatedError() // Starts every channel's accumulated error at zero
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
+            }
+
+            void Add(const MUInt16 &channelErrorUnweighted, int ch) // Adds one 16-bit error term to channel ch; ch is not range-checked
+            {
+                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted); // widened to UInt31 before accumulating
+            }
+
+            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const // Returns the combined error; weights are applied only when Flags::Uniform is clear
+            {
+                if (flags & cvtt::Flags::Uniform)
+                {
+                    MUInt31 total = m_errorUnweighted[0]; // uniform path: sum in integers, convert once; channelWeightsSq is not read
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        total = total + m_errorUnweighted[ch];
+                    return ParallelMath::ToFloat(total);
+                }
+                else
+                {
+                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0]; // weighted path: convert each channel to float, scale by its weight, then sum
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
+                    return total;
+                }
+            }
+
+        private:
+            MUInt31 m_errorUnweighted[TVectorSize]; // one running total per channel
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.cpp b/thirdparty/cvtt/ConvectionKernels_BC67.cpp
new file mode 100644
index 0000000000..791859b232
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC67.cpp
@@ -0,0 +1,3485 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC67.h"
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_BCCommon.h"
+#include "ConvectionKernels_BC7_Prio.h"
+#include "ConvectionKernels_BC7_SingleColor.h"
+#include "ConvectionKernels_BC6H_IO.h"
+#include "ConvectionKernels_EndpointRefiner.h"
+#include "ConvectionKernels_EndpointSelector.h"
+#include "ConvectionKernels_IndexSelectorHDR.h"
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        namespace BC67
+        {
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt15 MUInt15;
+
+            struct WorkInfo // bundles a mode, an error value, endpoints and two index sets; NOTE(review): looks like a per-block candidate record -- confirm
+            {
+                MUInt15 m_mode;
+                MFloat m_error;
+                MUInt15 m_ep[3][2][4];  // presumably [subset][endpoint][channel] -- TODO confirm against users
+                MUInt15 m_indexes[16];  // 16 entries; presumably one per pixel of a 4x4 block -- TODO confirm
+                MUInt15 m_indexes2[16]; // second 16-entry set; NOTE(review): likely the separate alpha index plane -- confirm
+
+                union // m_partition shares storage with m_isr
+                {
+                    MUInt15 m_partition;
+                    struct IndexSelectorAndRotation
+                    {
+                        MUInt15 m_indexSelector;
+                        MUInt15 m_rotation;
+                    } m_isr;
+                } m_u;
+            };
+        }
+
+        namespace BC7Data
+        {
+            enum AlphaMode // type of BC7ModeInfo::m_alphaMode
+            {
+                AlphaMode_Combined, // g_modes rows 6 and 7: nonzero m_alphaBits, m_alphaIndexBits == 0
+                AlphaMode_Separate, // g_modes rows 4 and 5: nonzero m_alphaIndexBits
+                AlphaMode_None,     // g_modes rows 0-3: m_alphaBits == 0
+            };
+
+            enum PBitMode // type of BC7ModeInfo::m_pBitMode; NOTE(review): presumably how the BC7 P-bit is shared -- confirm
+            {
+                PBitMode_PerEndpoint, // g_modes rows 0, 3, 6 and 7
+                PBitMode_PerSubset,   // g_modes row 1
+                PBitMode_None         // g_modes rows 2, 4 and 5
+            };
+
+            struct BC7ModeInfo // one row of g_modes; member order is the brace-initializer column order there
+            {
+                PBitMode m_pBitMode;
+                AlphaMode m_alphaMode;
+                int m_rgbBits;
+                int m_alphaBits;         // 0 in every AlphaMode_None row of g_modes
+                int m_partitionBits;     // 0 in the single-subset rows of g_modes (4, 5, 6)
+                int m_numSubsets;        // 1, 2 or 3 in g_modes
+                int m_indexBits;         // 2, 3 or 4 in g_modes, i.e. the non-NULL slots of g_weightTables
+                int m_alphaIndexBits;    // nonzero only in the AlphaMode_Separate rows of g_modes
+                bool m_hasIndexSelector; // true only in row 4 of g_modes
+            };
+
+            BC7ModeInfo g_modes[] = // columns: pBitMode, alphaMode, rgbBits, alphaBits, partitionBits, numSubsets, indexBits, alphaIndexBits, hasIndexSelector
+            { // NOTE(review): declared without static/const, so the table is mutable and externally linked
+                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
+                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
+                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
+                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
+
+                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
+                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
+                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
+                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
+            };
+
+            const int g_weight2[] = { 0, 21, 43, 64 }; // 4 entries rising from 0 to 64; presumably interpolation weights for 2-bit indexes -- confirm
+            const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; // 8 entries rising from 0 to 64
+            const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 16 entries rising from 0 to 64
+
+            const int *g_weightTables[] = // slot N points at g_weightN; the pointer array itself is not const
+            {
+                NULL, // slot 0: no table
+                NULL, // slot 1: no table
+                g_weight2,
+                g_weight3,
+                g_weight4
+            };
+
+            struct BC6HModeInfo // one row of g_hdrModes; member order is the brace-initializer column order there
+            {
+                uint16_t m_modeID;  // values 0x00..0x1e in g_hdrModes
+                bool m_partitioned; // selects the row of g_hdrModesExistForPrecision that covers this mode
+                bool m_transformed;
+                int m_aPrec;        // 6..16 in g_hdrModes
+                int m_bPrec[3];     // three values; presumably one per colour channel -- TODO confirm
+            };
+
+            // [partitioned][precision]: row 0 = non-partitioned, row 1 = partitioned; true where a g_hdrModes entry has that m_aPrec
+            bool g_hdrModesExistForPrecision[2][17] =
+            {
+                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
+                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
+                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
+            };
+
+            BC6HModeInfo g_hdrModes[] = // columns: modeID, partitioned, transformed, aPrec, { bPrec[0], bPrec[1], bPrec[2] }
+            { // NOTE(review): declared without static/const, so the table is mutable and externally linked
+                { 0x00, true,  true,  10,{ 5, 5, 5 } },
+                { 0x01, true,  true,  7,{ 6, 6, 6 } },
+                { 0x02, true,  true,  11,{ 5, 4, 4 } },
+                { 0x06, true,  true,  11,{ 4, 5, 4 } },
+                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
+                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
+                { 0x12, true,  true,  8,{ 6, 5, 5 } },
+                { 0x16, true,  true,  8,{ 5, 6, 5 } },
+                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
+                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
+                { 0x03, false, false, 10,{ 10, 10, 10 } },
+                { 0x07, false, true,  11,{ 9, 9, 9 } },
+                { 0x0b, false, true,  12,{ 8, 8, 8 } },
+                { 0x0f, false, true,  16,{ 4, 4, 4 } },
+            };
+
+            const int g_maxHDRPrecision = 16; // equals the largest m_aPrec in g_hdrModes and the last column index of g_hdrModesExistForPrecision
+
+            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]); // element count of g_hdrModes
+
+            static uint16_t g_partitionMap[64] = // 64 16-bit masks; NOTE(review): presumably bit i is the subset (0/1) of pixel i for two-subset partitions -- confirm
+            {
+                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+                0xC800, 0xFFEC, 0xFE80, 0xE800,
+                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+                0xF710, 0x008E, 0x7100, 0x08CE,
+                0x008C, 0x7310, 0x3100, 0x8CCE,
+                0x088C, 0x3110, 0x6666, 0x366C,
+                0x17E8, 0x0FF0, 0x718E, 0x399C,
+                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
+                0x3c3c, 0x55aa, 0x9696, 0xa55a,
+                0x73ce, 0x13c8, 0x324c, 0x3bdc,
+                0x6996, 0xc33c, 0x9966, 0x660,
+                0x272, 0x4e4, 0x4e40, 0x2720,
+                0xc936, 0x936c, 0x39c6, 0x639c,
+                0x9336, 0x9cc6, 0x817e, 0xe718,
+                0xccf0, 0xfcc, 0x7744, 0xee22,
+            };
+
+            static uint32_t g_partitionMap2[64] = // 64 32-bit words; NOTE(review): presumably 2 bits per pixel selecting one of three subsets -- confirm
+            {
+                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
+                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
+                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
+                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
+                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
+                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
+                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
+                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
+                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
+                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
+                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
+                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
+                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
+                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
+                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
+                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
+            };
+
+            static int g_fixupIndexes2[64] = // 64 values, each 2, 6, 8 or 15; NOTE(review): presumably the second subset's fix-up pixel per two-subset partition -- confirm
+            {
+                15,15,15,15,
+                15,15,15,15,
+                15,15,15,15,
+                15,15,15,15,
+                15, 2, 8, 2,
+                2, 8, 8,15,
+                2, 8, 2, 2,
+                8, 8, 2, 2,
+
+                15,15, 6, 8,
+                2, 8,15,15,
+                2, 8, 2, 2,
+                2,15,15, 6,
+                6, 2, 6, 8,
+                15,15, 2, 2,
+                15,15,15,15,
+                15, 2, 2,15,
+            };
+
+            static int g_fixupIndexes3[64][2] = // 64 pairs of values in 3..15; NOTE(review): presumably fix-up pixels of subsets 1 and 2 per three-subset partition -- confirm
+            {
+                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
+                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
+                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
+                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
+                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
+                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
+                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
+                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
+
+                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
+                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
+                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
+                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
+                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
+                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
+                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
+                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
+            };
+
+            static const unsigned char g_fragments[] = // concatenated lists of values 0..15; each row's comment is "start offset, length", mirroring the pairs in g_shapeRanges
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
+                0, 1, 2, 3,  // 16, 4
+                0, 1, 4,  // 20, 3
+                0, 1, 2, 4,  // 23, 4
+                2, 3, 7,  // 27, 3
+                1, 2, 3, 7,  // 30, 4
+                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
+                0, 1, 4, 8,  // 42, 4
+                0, 1, 2, 4, 5, 8,  // 46, 6
+                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
+                1, 4, 5, 6, 9,  // 60, 5
+                2, 5, 6, 7, 10,  // 65, 5
+                5, 6, 9, 10,  // 70, 4
+                2, 3, 7, 11,  // 74, 4
+                1, 2, 3, 6, 7, 11,  // 78, 6
+                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
+                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
+                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
+                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
+                0, 4, 8, 12,  // 128, 4
+                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
+                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
+                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
+                3, 6, 7, 8, 9, 12,  // 158, 6
+                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
+                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
+                5, 8, 9, 10, 13,  // 192, 5
+                8, 12, 13,  // 197, 3
+                4, 8, 12, 13,  // 200, 4
+                2, 3, 6, 9, 12, 13,  // 204, 6
+                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
+                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
+                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
+                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
+                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
+                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
+                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
+                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
+                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
+                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
+                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
+                6, 9, 10, 11, 14,  // 317, 5
+                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
+                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
+                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
+                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
+                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
+                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
+                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
+                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
+                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
+                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
+                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
+                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
+                8, 12, 13, 14,  // 418, 4
+                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
+                4, 8, 9, 12, 13, 14,  // 430, 6
+                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
+                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
+                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
+                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
+                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
+                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
+                3, 7, 11, 15,  // 490, 4
+                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
+                0, 4, 5, 10, 11, 15,  // 502, 6
+                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
+                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
+                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
+                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
+                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
+                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
+                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
+                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
+                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
+                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
+                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
+                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
+                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
+                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
+                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
+                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
+                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
+                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
+                0, 1, 5, 10, 14, 15,  // 669, 6
+                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
+                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
+                11, 14, 15,  // 691, 3
+                7, 11, 14, 15,  // 694, 4
+                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
+                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
+                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
+                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
+                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
+                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
+                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
+                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
+                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
+                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
+                11, 13, 14, 15,  // 792, 4
+                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
+                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
+                7, 10, 11, 13, 14, 15,  // 814, 6
+                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
+                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
+                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
+                12, 13, 14, 15,  // 848, 4
+                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
+                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
+                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
+                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
+                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
+                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
+                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
+                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
+                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
+                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
+                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
+                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
+                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
+                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
+                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
+                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
+                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
+                0, 2,  // 1040, 2
+                1, 3,  // 1042, 2
+                0, 1, 4, 5,  // 1044, 4
+                0, 1, 2, 4, 5,  // 1048, 5
+                2, 3, 6,  // 1053, 3
+                0, 2, 4, 6,  // 1056, 4
+                1, 2, 5, 6,  // 1060, 4
+                0, 1, 2, 3, 5, 6,  // 1064, 6
+                0, 1, 2, 4, 5, 6,  // 1070, 6
+                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
+                0, 3, 4, 7,  // 1083, 4
+                0, 1, 2, 3, 4, 7,  // 1087, 6
+                1, 3, 5, 7,  // 1093, 4
+                2, 3, 6, 7,  // 1097, 4
+                1, 2, 3, 6, 7,  // 1101, 5
+                1, 2, 3, 5, 6, 7,  // 1106, 6
+                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
+                4, 5, 6, 7,  // 1119, 4
+                0, 8,  // 1123, 2
+                0, 1, 4, 5, 8,  // 1125, 5
+                0, 1, 8, 9,  // 1130, 4
+                4, 5, 8, 9,  // 1134, 4
+                0, 1, 4, 5, 8, 9,  // 1138, 6
+                2, 6, 8, 9,  // 1144, 4
+                6, 7, 8, 9,  // 1148, 4
+                0, 2, 4, 6, 8, 10,  // 1152, 6
+                1, 2, 5, 6, 9, 10,  // 1158, 6
+                0, 3, 4, 7, 9, 10,  // 1164, 6
+                0, 1, 2, 8, 9, 10,  // 1170, 6
+                4, 5, 6, 8, 9, 10,  // 1176, 6
+                3, 11,  // 1182, 2
+                2, 3, 6, 7, 11,  // 1184, 5
+                0, 3, 8, 11,  // 1189, 4
+                0, 3, 4, 7, 8, 11,  // 1193, 6
+                1, 3, 5, 7, 9, 11,  // 1199, 6
+                2, 3, 10, 11,  // 1205, 4
+                1, 5, 10, 11,  // 1209, 4
+                4, 5, 10, 11,  // 1213, 4
+                6, 7, 10, 11,  // 1217, 4
+                2, 3, 6, 7, 10, 11,  // 1221, 6
+                1, 2, 3, 9, 10, 11,  // 1227, 6
+                5, 6, 7, 9, 10, 11,  // 1233, 6
+                8, 9, 10, 11,  // 1239, 4
+                4, 12,  // 1243, 2
+                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
+                8, 9, 12,  // 1253, 3
+                0, 4, 5, 8, 9, 12,  // 1256, 6
+                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
+                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
+                1, 5, 9, 13,  // 1276, 4
+                6, 7, 9, 13,  // 1280, 4
+                1, 4, 7, 10, 13,  // 1284, 5
+                1, 6, 8, 11, 13,  // 1289, 5
+                0, 1, 12, 13,  // 1294, 4
+                4, 5, 12, 13,  // 1298, 4
+                0, 1, 6, 7, 12, 13,  // 1302, 6
+                0, 1, 4, 8, 12, 13,  // 1308, 6
+                8, 9, 12, 13,  // 1314, 4
+                4, 8, 9, 12, 13,  // 1318, 5
+                4, 5, 8, 9, 12, 13,  // 1323, 6
+                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
+                0, 1, 6, 10, 12, 13,  // 1336, 6
+                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
+                0, 1, 10, 11, 12, 13,  // 1349, 6
+                2, 4, 7, 9, 14,  // 1355, 5
+                4, 5, 10, 14,  // 1360, 4
+                2, 6, 10, 14,  // 1364, 4
+                2, 5, 8, 11, 14,  // 1368, 5
+                0, 2, 12, 14,  // 1373, 4
+                8, 10, 12, 14,  // 1377, 4
+                4, 6, 8, 10, 12, 14,  // 1381, 6
+                13, 14,  // 1387, 2
+                9, 10, 13, 14,  // 1389, 4
+                5, 6, 9, 10, 13, 14,  // 1393, 6
+                0, 1, 2, 12, 13, 14,  // 1399, 6
+                4, 5, 6, 12, 13, 14,  // 1405, 6
+                8, 9, 12, 13, 14,  // 1411, 5
+                8, 9, 10, 12, 13, 14,  // 1416, 6
+                7, 15,  // 1422, 2
+                0, 5, 10, 15,  // 1424, 4
+                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
+                10, 11, 15,  // 1436, 3
+                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
+                3, 6, 7, 10, 11, 15,  // 1446, 6
+                12, 15,  // 1452, 2
+                0, 3, 12, 15,  // 1454, 4
+                4, 7, 12, 15,  // 1458, 4
+                0, 3, 6, 9, 12, 15,  // 1462, 6
+                0, 3, 5, 10, 12, 15,  // 1468, 6
+                8, 11, 12, 15,  // 1474, 4
+                5, 6, 8, 11, 12, 15,  // 1478, 6
+                4, 7, 8, 11, 12, 15,  // 1484, 6
+                1, 3, 13, 15,  // 1490, 4
+                9, 11, 13, 15,  // 1494, 4
+                5, 7, 9, 11, 13, 15,  // 1498, 6
+                2, 3, 14, 15,  // 1504, 4
+                2, 3, 4, 5, 14, 15,  // 1508, 6
+                6, 7, 14, 15,  // 1514, 4
+                2, 3, 5, 9, 14, 15,  // 1518, 6
+                2, 3, 8, 9, 14, 15,  // 1524, 6
+                10, 14, 15,  // 1530, 3
+                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
+                2, 3, 7, 11, 14, 15,  // 1540, 6
+                10, 11, 14, 15,  // 1546, 4
+                7, 10, 11, 14, 15,  // 1550, 5
+                6, 7, 10, 11, 14, 15,  // 1555, 6
+                1, 2, 3, 13, 14, 15,  // 1561, 6
+                5, 6, 7, 13, 14, 15,  // 1567, 6
+                10, 11, 13, 14, 15,  // 1573, 5
+                9, 10, 11, 13, 14, 15,  // 1578, 6
+                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
+                9, 10, 12, 13, 14, 15,  // 1592, 6
+                8, 11, 12, 13, 14, 15,  // 1598, 6
+                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
+            };
+            static const int g_shapeRanges[][2] = // 243 { start offset, length } pairs matching the per-row "offset, length" comments in g_fragments
+            {
+                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
+                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
+                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
+                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
+                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
+                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
+                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
+                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
+                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
+                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
+                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
+                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
+                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
+                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
+                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
+                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
+                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
+                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
+                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
+                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
+                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
+                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
+                { 1604, 8 },
+            };
+            static const int g_shapes1[][2] = // single pair; NOTE(review): { 0, 16 } equals g_shapeRanges[0] -- confirm how users read it
+            {
+                { 0, 16 }
+            };
+            static const int g_shapes2[64][2] = // 64 pairs of values in 1..128; presumably the g_shapeRanges index of each subset of a two-subset partition -- TODO confirm
+            {
+                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
+                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
+                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
+                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
+                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
+                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
+                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
+                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
+            };
+            static const int g_shapes3[64][3] = // 64 triples of values in 1..242; presumably the g_shapeRanges index of each subset of a three-subset partition -- TODO confirm
+            {
+                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
+                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
+                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
+                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
+                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
+                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
+                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
+                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
+            };
+
+            static const int g_shapeList1[] = // one entry: 0
+            {
+                0,
+            };
+
+            static const int g_shapeList2[] = // the consecutive integers 1..128, the same range as the values in g_shapes2
+            {
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
+                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
+                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+                122, 123, 124, 125, 126, 127, 128,
+            };
+
+            static const int g_shapeList12[] = // the consecutive integers 0..128, i.e. g_shapeList1 followed by g_shapeList2
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126, 127, 128,
+            };
+
+            static const int g_shapeList3[] = // ascending list: selected values up to 128, then every integer 129..242; NOTE(review): presumably the values used in g_shapes3 -- confirm
+            {
+                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
+                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
+                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
+                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
+                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
+                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
+                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
+                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
+                235, 236, 237, 238, 239, 240, 241, 242,
+            };
+
+            static const int g_shapeList3Short[] = // 36 ascending values, every one of which also appears in g_shapeList3
+            {
+                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
+                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
+                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
+                233, 237, 240,
+            };
+
+            static const int g_shapeListAll[] = // the consecutive integers 0..242, one per entry of g_shapeRanges
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
+                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
+                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
+                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
+                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+                242,
+            };
+
+            // Compile-time element counts of the lookup tables in this namespace.
+            // They are integral constants so they can be used as array bounds.
+            static const int g_numShapes1 = static_cast<int>(sizeof(g_shapeList1) / sizeof(*g_shapeList1));
+            static const int g_numShapes2 = static_cast<int>(sizeof(g_shapeList2) / sizeof(*g_shapeList2));
+            static const int g_numShapes12 = static_cast<int>(sizeof(g_shapeList12) / sizeof(*g_shapeList12));
+            static const int g_numShapes3 = static_cast<int>(sizeof(g_shapeList3) / sizeof(*g_shapeList3));
+            static const int g_numShapes3Short = static_cast<int>(sizeof(g_shapeList3Short) / sizeof(*g_shapeList3Short));
+            static const int g_numShapesAll = static_cast<int>(sizeof(g_shapeListAll) / sizeof(*g_shapeListAll));
+            static const int g_numFragments = static_cast<int>(sizeof(g_fragments) / sizeof(*g_fragments));
+        }
+
+        // Builds a 128-bit block as four 32-bit words. Bit N of the block is stored in
+        // m_vector[N / 32] at bit position (N % 32). Values are appended with Pack() and
+        // the finished block is serialized with Flush().
+        struct PackingVector
+        {
+            uint32_t m_vector[4];   // Accumulated bits
+            int m_offset;           // Number of bits appended so far
+
+            // Resets to an all-zero vector with nothing appended.
+            void Init()
+            {
+                for (int i = 0; i < 4; i++)
+                    m_vector[i] = 0;
+
+                m_offset = 0;
+            }
+
+            // Starts from the first "bits" bits of an already packed vector "v".
+            // Whole words are copied, so bits of the last copied word above "bits" are
+            // taken from "v" unchanged. Words past the copied range are cleared: Pack()
+            // ORs into them, so they must not hold uninitialized data.
+            void InitPacked(const uint32_t *v, int bits)
+            {
+                const int numWords = (bits + 31) / 32;
+
+                for (int i = 0; i < 4; i++)
+                    m_vector[i] = (i < numWords) ? v[i] : 0;
+
+                m_offset = bits;
+            }
+
+            // Appends the low "bits" bits of "value" at the current offset.
+            // Bits of "value" above "bits" are not masked off here and would be ORed
+            // into the vector as well.
+            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
+            {
+                int vOffset = m_offset >> 5;
+                int bitOffset = m_offset & 0x1f;
+
+                m_vector[vOffset] |= static_cast<uint32_t>(value) << bitOffset;
+
+                // Part of the value that did not fit in the current word spills into the next one.
+                int overflowBits = bitOffset + bits - 32;
+                if (overflowBits > 0)
+                    m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
+
+                m_offset += bits;
+            }
+
+            // Writes the 16 bytes of the block to "output", each word least-significant byte first.
+            // Exactly 128 bits must have been appended.
+            inline void Flush(uint8_t* output)
+            {
+                assert(m_offset == 128);
+
+                for (int v = 0; v < 4; v++)
+                {
+                    uint32_t chunk = m_vector[v];
+                    for (int b = 0; b < 4; b++)
+                        output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
+                }
+            }
+        };
+
+
+        // Reads a 128-bit block held as four 32-bit words. Bits are consumed from the
+        // low end of m_vector[0]; after each read the remaining bits are shifted down.
+        struct UnpackingVector
+        {
+            uint32_t m_vector[4];
+
+            // Loads 16 bytes. Byte b supplies bits (b % 4) * 8 and up of word b / 4.
+            void Init(const uint8_t *bytes)
+            {
+                for (int i = 0; i < 4; i++)
+                    m_vector[i] = 0;
+
+                // Widen before shifting: a uint8_t promotes to signed int, and shifting a
+                // value of 0x80 or more left by 24 would overflow it.
+                for (int b = 0; b < 16; b++)
+                    m_vector[b / 4] |= static_cast<uint32_t>(bytes[b]) << ((b % 4) * 8);
+            }
+
+            // Copies the words covering the first "bits" bits into "v", then removes
+            // those bits from the vector. Whole words are copied, so bits of the last
+            // word of "v" above "bits" are not masked. "bits" is expected to be in [0, 128].
+            inline void UnpackStart(uint32_t *v, int bits)
+            {
+                for (int b = 0; b < bits; b += 32)
+                    v[b / 32] = m_vector[b / 32];
+
+                int entriesShifted = bits / 32;
+                int carry = bits % 32;
+
+                for (int i = entriesShifted; i < 4; i++)
+                    m_vector[i - entriesShifted] = m_vector[i];
+
+                // Clear the vacated top words so no stale copy is shifted back in below.
+                for (int i = 4 - entriesShifted; i < 4; i++)
+                    m_vector[i] = 0;
+
+                if (carry)
+                {
+                    // Unsigned literal: (1 << 31) - 1 would overflow a signed int.
+                    uint32_t bitMask = (static_cast<uint32_t>(1) << carry) - 1;
+                    for (int i = 0; i < 4; i++)
+                    {
+                        m_vector[i] >>= carry;
+                        if (i != 3)
+                            m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry);
+                    }
+                }
+            }
+
+            // Removes and returns the next "bits" bits.
+            inline ParallelMath::ScalarUInt16 Unpack(int bits)
+            {
+                // Nothing to read. Also avoids the undefined shift by 32 below.
+                if (bits == 0)
+                    return static_cast<ParallelMath::ScalarUInt16>(0);
+
+                uint32_t bitMask = (static_cast<uint32_t>(1) << bits) - 1;
+
+                ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
+
+                for (int i = 0; i < 4; i++)
+                {
+                    m_vector[i] >>= bits;
+                    if (i != 3)
+                        m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
+                }
+
+                return result;
+            }
+        };
+
+        // Scales an HDR value: unsigned inputs become (v * 64 + 30) / 31, signed inputs
+        // become (v * 32 + bias) / 31 where bias is -30 for negative v and +30 otherwise.
+        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
+        {
+            if (!isSigned)
+                return (v * 64.0f + 30.0f) / 31.0f;
+
+            ParallelMath::Float bias = ParallelMath::Select(
+                ParallelMath::Less(v, ParallelMath::MakeFloatZero()),
+                ParallelMath::MakeFloat(-30.0f),
+                ParallelMath::MakeFloat(30.0f));
+
+            return (v * 32.0f + bias) / 31.0f;
+        }
+
+        // Converts a signed scaled value to sign-magnitude form: the magnitude is
+        // multiplied by 31 and shifted right by 5, then bit 15 is set for negative inputs.
+        // The input must not be -32768, whose magnitude does not fit in 15 bits; this is
+        // checked per lane when CVTT_ENABLE_ASSERTS is defined.
+        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
+        {
+#ifdef CVTT_ENABLE_ASSERTS
+            for (int i = 0; i < ParallelMath::ParallelSize; i++)
+                assert(ParallelMath::Extract(v, i) != -32768);
+#endif
+
+            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
+            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
+
+            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
+            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
+            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
+
+            // -32768 is the bit pattern with only bit 15 set; it is ORed in as the sign bit.
+            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
+
+            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
+        }
+
+        // Converts an unsigned scaled value back: (v * 31) >> 6, narrowed to 15 bits.
+        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
+        {
+            return ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                    ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)),
+                    6));
+        }
+
+        // Applies the signed or unsigned unscale conversion to each of the three channels
+        // of both endpoints, writing the results to outEP.
+        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
+        {
+            for (int endpoint = 0; endpoint < 2; endpoint++)
+            {
+                const ParallelMath::AInt16 *src = inEP[endpoint];
+                ParallelMath::AInt16 *dst = outEP[endpoint];
+
+                for (int channel = 0; channel < 3; channel++)
+                {
+                    if (!isSigned)
+                    {
+                        dst[channel] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(
+                            UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(src[channel])));
+                    }
+                    else
+                    {
+                        dst[channel] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(
+                            UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(src[channel])));
+                    }
+                }
+            }
+        }
+
+        // Scratch storage used by BC7Computer::TrySinglePlane while it evaluates shapes.
+        struct SinglePlaneTemporaries
+        {
+            // Initial endpoint estimates, indexed by shape number.
+            UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
+            UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];
+
+            // Best index found for each fragment entry; addressed as (shape start + pixel within shape).
+            ParallelMath::UInt15 fragmentBestIndexes[BC7Data::g_numFragments];
+            // Best endpoints found per shape, as [shape][endpoint][channel].
+            ParallelMath::UInt15 shapeBestEP[BC7Data::g_numShapesAll][2][4];
+            // Lowest error found per shape.
+            ParallelMath::Float shapeBestError[BC7Data::g_numShapesAll];
+        };
+    }
+}
+
+// Produces a tweaked pair of alpha endpoints. The tweak factors from
+// Util::ComputeTweakFactors are applied along the line from original[0] to original[1],
+// and each result is clamped to [0, 255] and rounded to nearest.
+void cvtt::Internal::BC7Computer::TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
+{
+    ParallelMath::RoundTowardNearestForScope roundingMode;
+
+    float tweakFactors[2];
+    Util::ComputeTweakFactors(tweak, range, tweakFactors);
+
+    MFloat start = ParallelMath::ToFloat(original[0]);
+    MFloat span = ParallelMath::ToFloat(original[1]) - start;
+
+    for (int i = 0; i < 2; i++)
+        result[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(start + span * tweakFactors[i], 0.0f, 255.0f), &roundingMode);
+}
+
+// Quantizes the first "channels" entries of "color" in place to "bits" bits:
+// each value c becomes (c * (2^bits - 1) + 127 + 2^(7 - bits)) >> 8.
+void cvtt::Internal::BC7Computer::Quantize(MUInt15* color, int bits, int channels)
+{
+    for (int i = 0; i < channels; i++)
+    {
+        MUInt15 &value = color[i];
+
+        // (value << bits) - value is value * (2^bits - 1) without a multiply.
+        MUInt15 scaled = (value << bits) - value;
+        value = ParallelMath::RightShift(scaled + ParallelMath::MakeUInt15(127 + (1 << (7 - bits))), 8);
+    }
+}
+
+// Quantizes the first "channels" entries of "color" in place to "bits" bits plus a
+// fixed low bit "p". The result holds the quantized value in the upper bits and p in bit 0.
+void cvtt::Internal::BC7Computer::QuantizeP(MUInt15* color, int bits, uint16_t p, int channels)
+{
+    // The rounding addend depends on which low bit is being forced.
+    int16_t addend = p ? ((1 << (8 - bits)) - 1) : 255;
+
+    for (int i = 0; i < channels; i++)
+    {
+        MUInt16 wide = ParallelMath::LosslessCast<MUInt16>::Cast(color[i]);
+
+        wide = ParallelMath::RightShift((wide << (bits + 1)) - wide + addend, 9);
+        wide = (wide << 1) | ParallelMath::MakeUInt16(p);
+
+        color[i] = ParallelMath::LosslessCast<MUInt15>::Cast(wide);
+    }
+}
+
+// Expands the first "channels" entries of "color" in place from "bits" bits to 8 bits:
+// the value is shifted to the top of the byte and its high bits are replicated below.
+void cvtt::Internal::BC7Computer::Unquantize(MUInt15* color, int bits, int channels)
+{
+    for (int i = 0; i < channels; i++)
+    {
+        MUInt15 expanded = color[i] << (8 - bits);
+        color[i] = expanded | ParallelMath::RightShift(expanded, bits);
+    }
+}
+
+// Mode 0 endpoint round trip: RGB is quantized to 4 bits plus a per-endpoint p-bit,
+// expanded back from 5 bits, and alpha is forced to 255.
+void cvtt::Internal::BC7Computer::CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2])
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *endpoint = ep[epi];
+
+        QuantizeP(endpoint, 4, p[epi], 3);
+        Unquantize(endpoint, 5, 3);
+        endpoint[3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+// Mode 1 endpoint round trip: RGB is quantized to 6 bits plus one p-bit shared by both
+// endpoints, expanded back from 7 bits, and alpha is forced to 255.
+void cvtt::Internal::BC7Computer::CompressEndpoints1(MUInt15 ep[2][4], uint16_t p)
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *endpoint = ep[epi];
+
+        QuantizeP(endpoint, 6, p, 3);
+        Unquantize(endpoint, 7, 3);
+        endpoint[3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+// Mode 2 endpoint round trip: RGB is quantized to 5 bits and expanded back,
+// and alpha is forced to 255.
+void cvtt::Internal::BC7Computer::CompressEndpoints2(MUInt15 ep[2][4])
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *endpoint = ep[epi];
+
+        Quantize(endpoint, 5, 3);
+        Unquantize(endpoint, 5, 3);
+        endpoint[3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+// Mode 3 endpoint round trip: RGB is quantized to 7 bits plus a per-endpoint p-bit
+// (no expansion step is applied), and alpha is forced to 255.
+void cvtt::Internal::BC7Computer::CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2])
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *endpoint = ep[epi];
+
+        QuantizeP(endpoint, 7, p[epi], 3);
+        endpoint[3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+// Round-trips separate RGB and alpha endpoints: RGB through 5 bits, alpha through 6 bits.
+void cvtt::Internal::BC7Computer::CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2])
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *rgb = epRGB[epi];
+        MUInt15 *alpha = &epA[epi];
+
+        Quantize(rgb, 5, 3);
+        Unquantize(rgb, 5, 3);
+
+        Quantize(alpha, 6, 1);
+        Unquantize(alpha, 6, 1);
+    }
+}
+
+// Round-trips RGB endpoints through 7 bits. The alpha endpoints are left untouched.
+void cvtt::Internal::BC7Computer::CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2])
+{
+    // Alpha is stored at full precision, so there is nothing to do for it.
+    (void)epA;
+
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *rgb = epRGB[epi];
+
+        Quantize(rgb, 7, 3);
+        Unquantize(rgb, 7, 3);
+    }
+}
+
+// Mode 6 endpoint round trip: all four channels are quantized to 7 bits plus a
+// per-endpoint p-bit (no expansion step is applied).
+void cvtt::Internal::BC7Computer::CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2])
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        QuantizeP(ep[epi], 7, p[epi], 4);
+    }
+}
+
+// Mode 7 endpoint round trip: all four channels are quantized to 5 bits plus a
+// per-endpoint p-bit, then expanded back from 6 bits.
+void cvtt::Internal::BC7Computer::CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2])
+{
+    for (int epi = 0; epi < 2; epi++)
+    {
+        MUInt15 *endpoint = ep[epi];
+
+        QuantizeP(endpoint, 5, p[epi], 4);
+        Unquantize(endpoint, 6, 4);
+    }
+}
+
+// Tries to encode one shape as a single solid color taken from precomputed tables.
+//
+// For every lane, the rounded per-channel average selects a table entry (endpoint pair
+// plus the color it reconstructs to) in each candidate table. The table whose
+// reconstructed color is closest to the true average is kept, and the resulting solid
+// color is scored against the shape's actual pixels. Lanes where it beats the current
+// best receive the new endpoints and indexes.
+//
+//   flags                - forwarded to the error computation
+//   pixels               - source pixels of the whole block
+//   average              - per-channel average color of the shape
+//   numRealChannels      - number of channels that are looked up and scored
+//   fragmentStart        - pixel numbers belonging to the shape (shapeLength entries)
+//   staticAlphaError     - constant error added to the computed error
+//   punchThroughInvalid  - indexed by a table's m_pBits; lanes set there may not use that table
+//   shapeBestError       - in/out: lowest error so far for this shape
+//   shapeBestEP          - in/out: endpoints belonging to shapeBestError
+//   fragmentBestIndexes  - in/out: per-pixel indexes belonging to shapeBestError
+//   channelWeightsSq     - squared channel weights used for scoring
+//   tables, numTables    - candidate single-color tables
+//   rtn                  - rounding mode scope used when rounding the average
+void cvtt::Internal::BC7Computer::TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
+
+    MUInt15 intAverage[4];
+    for (int ch = 0; ch < 4; ch++)
+        intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
+
+    MUInt15 eps[2][4];
+    MUInt15 reconstructed[4];
+    MUInt15 index = ParallelMath::MakeUInt15(0);
+
+    // Defaults (black, opaque) remain in any lane or channel that no table overwrites.
+    for (int epi = 0; epi < 2; epi++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            eps[epi][ch] = ParallelMath::MakeUInt15(0);
+        eps[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        reconstructed[ch] = ParallelMath::MakeUInt15(0);
+    reconstructed[3] = ParallelMath::MakeUInt15(255);
+
+    // Depending on the target index and parity bits, there are multiple valid solid colors.
+    // We want to find the one closest to the actual average.
+    for (int t = 0; t < numTables; t++)
+    {
+        const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
+
+        ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
+
+        MUInt15 candidateReconstructed[4];
+        MUInt15 candidateEPs[2][4];
+
+        // Table lookups are done one lane at a time; only the first numRealChannels
+        // channels of the candidate arrays are filled in and read.
+        for (int i = 0; i < ParallelMath::ParallelSize; i++)
+        {
+            for (int ch = 0; ch < numRealChannels; ch++)
+            {
+                ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
+                assert(avgValue >= 0 && avgValue <= 255);
+
+                const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
+
+                ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
+                ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
+                ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
+            }
+        }
+
+        // Weighted squared distance between this table's color and the true average.
+        MFloat avgError = ParallelMath::MakeFloatZero();
+        for (int ch = 0; ch < numRealChannels; ch++)
+        {
+            MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
+            avgError = avgError + delta * delta * channelWeightsSq[ch];
+        }
+
+        ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
+        better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
+
+        if (ParallelMath::AnySet(better))
+        {
+            ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
+
+            MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
+
+            ParallelMath::ConditionalSet(index, better, candidateIndex);
+
+            for (int ch = 0; ch < numRealChannels; ch++)
+                ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
+
+            for (int epi = 0; epi < 2; epi++)
+                for (int ch = 0; ch < numRealChannels; ch++)
+                    ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
+        }
+    }
+
+    // Score the chosen solid color against every pixel of the shape.
+    AggregatedError<4> aggError;
+    for (int pxi = 0; pxi < shapeLength; pxi++)
+    {
+        int px = fragmentStart[pxi];
+
+        BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
+    }
+
+    MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
+
+    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
+    if (ParallelMath::AnySet(better))
+    {
+        shapeBestError = ParallelMath::Min(shapeBestError, error);
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < numRealChannels; ch++)
+                ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
+        }
+
+        // A solid color uses the same index for every pixel of the shape.
+        for (int pxi = 0; pxi < shapeLength; pxi++)
+            ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
+    }
+}
+
+void cvtt::Internal::BC7Computer::TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    float channelWeightsSq[4];
+
+    for (int ch = 0; ch < 4; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    SinglePlaneTemporaries temps;
+
+    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
+    ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
+    for (int px = 0; px < 16; px++)
+    {
+        MUInt15 a = pixels[px][3];
+        maxAlpha = ParallelMath::Max(maxAlpha, a);
+        minAlpha = ParallelMath::Min(minAlpha, a);
+
+        isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
+    }
+
+    ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
+    ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
+
+    bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
+
+    // Try RGB modes if any block has a min alpha 251 or higher
+    bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
+
+    // Try mode 7 if any block has alpha.
+    // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
+    // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
+    // situations, and only by at most 1 unit of error per pixel.
+    bool allowMode7 = anyBlockHasAlpha || (encodingPlan.mode7RGBPartitionEnabled != 0);
+
+    MFloat preWeightedPixels[16][4];
+
+    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
+
+    // Get initial RGB endpoints
+    if (allowRGBModes)
+    {
+        const uint8_t *shapeList = encodingPlan.rgbShapeList;
+        int numShapesToEvaluate = encodingPlan.rgbNumShapesToEvaluate;
+
+        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            int shapeStart = BC7Data::g_shapeRanges[shape][0];
+            int shapeSize = BC7Data::g_shapeRanges[shape][1];
+
+            EndpointSelector<3, 8> epSelector;
+
+            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+            {
+                for (int spx = 0; spx < shapeSize; spx++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + spx];
+                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
+                }
+                epSelector.FinishPass(epPass);
+            }
+            temps.unfinishedRGB[shape] = epSelector.GetEndpoints(channelWeights);
+        }
+    }
+
+    // Get initial RGBA endpoints
+    {
+        const uint8_t *shapeList = encodingPlan.rgbaShapeList;
+        int numShapesToEvaluate = encodingPlan.rgbaNumShapesToEvaluate;
+
+        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            if (anyBlockHasAlpha || !allowRGBModes)
+            {
+                int shapeStart = BC7Data::g_shapeRanges[shape][0];
+                int shapeSize = BC7Data::g_shapeRanges[shape][1];
+
+                EndpointSelector<4, 8> epSelector;
+
+                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+                {
+                    for (int spx = 0; spx < shapeSize; spx++)
+                    {
+                        int px = BC7Data::g_fragments[shapeStart + spx];
+                        epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
+                    }
+                    epSelector.FinishPass(epPass);
+                }
+                temps.unfinishedRGBA[shape] = epSelector.GetEndpoints(channelWeights);
+            }
+            else
+            {
+                temps.unfinishedRGBA[shape] = temps.unfinishedRGB[shape].ExpandTo<4>(255);
+            }
+        }
+    }
+
+    for (uint16_t mode = 0; mode <= 7; mode++)
+    {
+        if (mode == 4 || mode == 5)
+            continue;
+
+        if (mode < 4 && !allowRGBModes)
+            continue;
+
+        if (mode == 7 && !allowMode7)
+            continue;
+
+        uint64_t partitionEnabledBits = 0;
+        switch (mode)
+        {
+        case 0:
+            partitionEnabledBits = encodingPlan.mode0PartitionEnabled;
+            break;
+        case 1:
+            partitionEnabledBits = encodingPlan.mode1PartitionEnabled;
+            break;
+        case 2:
+            partitionEnabledBits = encodingPlan.mode2PartitionEnabled;
+            break;
+        case 3:
+            partitionEnabledBits = encodingPlan.mode3PartitionEnabled;
+            break;
+        case 6:
+            partitionEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
+            break;
+        case 7:
+            if (anyBlockHasAlpha)
+                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
+            else
+                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
+            break;
+        default:
+            break;
+        }
+
+        bool isRGB = (mode < 4);
+
+        unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
+        int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
+        int indexPrec = BC7Data::g_modes[mode].m_indexBits;
+
+        int parityBitMax = 1;
+        if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+            parityBitMax = 4;
+        else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
+            parityBitMax = 2;
+
+        int numRealChannels = isRGB ? 3 : 4;
+
+        int numShapes;
+        const int *shapeList;
+
+        if (numSubsets == 1)
+        {
+            numShapes = BC7Data::g_numShapes1;
+            shapeList = BC7Data::g_shapeList1;
+        }
+        else if (numSubsets == 2)
+        {
+            numShapes = BC7Data::g_numShapes2;
+            shapeList = BC7Data::g_shapeList2;
+        }
+        else
+        {
+            assert(numSubsets == 3);
+            if (numPartitions == 16)
+            {
+                numShapes = BC7Data::g_numShapes3Short;
+                shapeList = BC7Data::g_shapeList3Short;
+            }
+            else
+            {
+                assert(numPartitions == 64);
+                numShapes = BC7Data::g_numShapes3;
+                shapeList = BC7Data::g_shapeList3;
+            }
+        }
+
+        for (int slot = 0; slot < BC7Data::g_numShapesAll; slot++)
+            temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
+
+        for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            int numTweakRounds = 0;
+            if (isRGB)
+                numTweakRounds = encodingPlan.seedPointsForShapeRGB[shape];
+            else
+                numTweakRounds = encodingPlan.seedPointsForShapeRGBA[shape];
+
+            if (numTweakRounds == 0)
+                continue;
+
+            if (numTweakRounds > MaxTweakRounds)
+                numTweakRounds = MaxTweakRounds;
+
+            int shapeStart = BC7Data::g_shapeRanges[shape][0];
+            int shapeLength = BC7Data::g_shapeRanges[shape][1];
+
+            AggregatedError<1> alphaAggError;
+            if (isRGB && anyBlockHasAlpha)
+            {
+                MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
+
+                for (int pxi = 0; pxi < shapeLength; pxi++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + pxi];
+                    MUInt15 original[1] = { pixels[px][3] };
+                    BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
+                }
+            }
+
+            float alphaWeightsSq[1] = { channelWeightsSq[3] };
+            MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
+
+            MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
+
+            for (int tweak = 0; tweak < numTweakRounds; tweak++)
+            {
+                if (isRGB)
+                {
+                    temps.unfinishedRGB[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
+                    tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
+                }
+                else
+                {
+                    temps.unfinishedRGBA[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
+                }
+            }
+
+            ParallelMath::Int16CompFlag punchThroughInvalid[4];
+            for (int pIter = 0; pIter < parityBitMax; pIter++)
+            {
+                punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
+
+                if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
+                {
+                    // Modes 6 and 7 have parity bits that affect alpha
+                    if (pIter == 0)
+                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
+                    else if (pIter == parityBitMax - 1)
+                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
+                    else
+                        punchThroughInvalid[pIter] = isPunchThrough;
+                }
+            }
+
+            for (int pIter = 0; pIter < parityBitMax; pIter++)
+            {
+                if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
+                    continue;
+
+                bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    uint16_t p[2];
+                    p[0] = (pIter & 1);
+                    p[1] = ((pIter >> 1) & 1);
+
+                    MUInt15 ep[2][4];
+
+                    for (int epi = 0; epi < 2; epi++)
+                        for (int ch = 0; ch < 4; ch++)
+                            ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
+
+                    for (int refine = 0; refine < numRefineRounds; refine++)
+                    {
+                        switch (mode)
+                        {
+                        case 0:
+                            CompressEndpoints0(ep, p);
+                            break;
+                        case 1:
+                            CompressEndpoints1(ep, p[0]);
+                            break;
+                        case 2:
+                            CompressEndpoints2(ep);
+                            break;
+                        case 3:
+                            CompressEndpoints3(ep, p);
+                            break;
+                        case 6:
+                            CompressEndpoints6(ep, p);
+                            break;
+                        case 7:
+                            CompressEndpoints7(ep, p);
+                            break;
+                        default:
+                            assert(false);
+                            break;
+                        };
+
+                        MFloat shapeError = ParallelMath::MakeFloatZero();
+
+                        IndexSelector<4> indexSelector;
+                        indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
+
+                        EndpointRefiner<4> epRefiner;
+                        epRefiner.Init(1 << indexPrec, channelWeights);
+
+                        MUInt15 indexes[16];
+
+                        AggregatedError<4> aggError;
+                        for (int pxi = 0; pxi < shapeLength; pxi++)
+                        {
+                            int px = BC7Data::g_fragments[shapeStart + pxi];
+
+                            MUInt15 index;
+                            MUInt15 reconstructed[4];
+
+                            index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
+                            indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
+
+                            if (flags & cvtt::Flags::BC7_FastIndexing)
+                                BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
+                            else
+                            {
+                                MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
+
+                                MUInt15 altIndexes[2];
+                                altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
+
+                                for (int ii = 0; ii < 2; ii++)
+                                {
+                                    indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
+
+                                    MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
+                                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
+                                    error = ParallelMath::Min(error, altError);
+                                    ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
+                                }
+
+                                shapeError = shapeError + error;
+                            }
+
+                            if (refine != numRefineRounds - 1)
+                                epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
+
+                            indexes[pxi] = index;
+                        }
+
+                        if (flags & cvtt::Flags::BC7_FastIndexing)
+                            shapeError = aggError.Finalize(flags, channelWeightsSq);
+
+                        if (isRGB)
+                            shapeError = shapeError + staticAlphaError;
+
+                        ParallelMath::FloatCompFlag shapeErrorBetter;
+                        ParallelMath::Int16CompFlag shapeErrorBetter16;
+
+                        shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shape]);
+                        shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
+
+                        if (ParallelMath::AnySet(shapeErrorBetter16))
+                        {
+                            bool punchThroughOK = true;
+                            if (needPunchThroughCheck)
+                            {
+                                shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
+                                shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
+
+                                if (!ParallelMath::AnySet(shapeErrorBetter16))
+                                    punchThroughOK = false;
+                            }
+
+                            if (punchThroughOK)
+                            {
+                                ParallelMath::ConditionalSet(temps.shapeBestError[shape], shapeErrorBetter, shapeError);
+                                for (int epi = 0; epi < 2; epi++)
+                                    for (int ch = 0; ch < numRealChannels; ch++)
+                                        ParallelMath::ConditionalSet(temps.shapeBestEP[shape][epi][ch], shapeErrorBetter16, ep[epi][ch]);
+
+                                for (int pxi = 0; pxi < shapeLength; pxi++)
+                                    ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
+                            }
+                        }
+
+                        if (refine != numRefineRounds - 1)
+                            epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
+                    } // refine
+                } // tweak
+            } // p
+
+            if (flags & cvtt::Flags::BC7_TrySingleColor)
+            {
+                MUInt15 total[4];
+                for (int ch = 0; ch < 4; ch++)
+                    total[ch] = ParallelMath::MakeUInt15(0);
+
+                for (int pxi = 0; pxi < shapeLength; pxi++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + pxi];
+                    for (int ch = 0; ch < 4; ch++)
+                        total[ch] = total[ch] + pixels[pxi][ch];
+                }
+
+                MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
+                MFloat average[4];
+                for (int ch = 0; ch < 4; ch++)
+                    average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
+
+                const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
+                MFloat &shapeBestError = temps.shapeBestError[shape];
+                MUInt15 (&shapeBestEP)[2][4] = temps.shapeBestEP[shape];
+                MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
+
+                const cvtt::Tables::BC7SC::Table **scTables = NULL;
+                int numSCTables = 0;
+
+                const cvtt::Tables::BC7SC::Table *tables0[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i3,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables1[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i1,
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i2,
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i3,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i1,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i2,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i3,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables2[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode2,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables3[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode3_p0,
+                    &cvtt::Tables::BC7SC::g_mode3_p1,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables6[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i1,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i2,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i3,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i4,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i5,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i6,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i7,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i1,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i2,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i3,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i4,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i5,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i6,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i7,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables7[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode7_p00,
+                    &cvtt::Tables::BC7SC::g_mode7_p01,
+                    &cvtt::Tables::BC7SC::g_mode7_p10,
+                    &cvtt::Tables::BC7SC::g_mode7_p11,
+                };
+
+                switch (mode)
+                {
+                case 0:
+                {
+                    scTables = tables0;
+                    numSCTables = sizeof(tables0) / sizeof(tables0[0]);
+                }
+                break;
+                case 1:
+                {
+                    scTables = tables1;
+                    numSCTables = sizeof(tables1) / sizeof(tables1[0]);
+                }
+                break;
+                case 2:
+                {
+
+                    scTables = tables2;
+                    numSCTables = sizeof(tables2) / sizeof(tables2[0]);
+                }
+                break;
+                case 3:
+                {
+                    scTables = tables3;
+                    numSCTables = sizeof(tables3) / sizeof(tables3[0]);
+                }
+                break;
+                case 6:
+                {
+                    scTables = tables6;
+                    numSCTables = sizeof(tables6) / sizeof(tables6[0]);
+                }
+                break;
+                case 7:
+                {
+                    scTables = tables7;
+                    numSCTables = sizeof(tables7) / sizeof(tables7[0]);
+                }
+                break;
+                default:
+                    assert(false);
+                    break;
+                }
+
+                TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
+            }
+        } // shapeIter
+
+        uint64_t partitionsEnabledBits = 0xffffffffffffffffULL;
+
+        switch (mode)
+        {
+        case 0:
+            partitionsEnabledBits = encodingPlan.mode0PartitionEnabled;
+            break;
+        case 1:
+            partitionsEnabledBits = encodingPlan.mode1PartitionEnabled;
+            break;
+        case 2:
+            partitionsEnabledBits = encodingPlan.mode2PartitionEnabled;
+            break;
+        case 3:
+            partitionsEnabledBits = encodingPlan.mode3PartitionEnabled;
+            break;
+        case 6:
+            partitionsEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
+            break;
+        case 7:
+            if (anyBlockHasAlpha)
+                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
+            else
+                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
+            break;
+        default:
+            break;
+        };
+
+        for (uint16_t partition = 0; partition < numPartitions; partition++)
+        {
+            if (((partitionsEnabledBits >> partition) & 1) == 0)
+                continue;
+
+            const int *partitionShapes;
+            if (numSubsets == 1)
+                partitionShapes = BC7Data::g_shapes1[partition];
+            else if (numSubsets == 2)
+                partitionShapes = BC7Data::g_shapes2[partition];
+            else
+            {
+                assert(numSubsets == 3);
+                partitionShapes = BC7Data::g_shapes3[partition];
+            }
+
+            MFloat totalError = ParallelMath::MakeFloatZero();
+            for (int subset = 0; subset < numSubsets; subset++)
+                totalError = totalError + temps.shapeBestError[partitionShapes[subset]];
+
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
+            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+            if (mode == 7 && anyBlockHasAlpha)
+            {
+                // Some lanes could be better, but we filter them out to ensure consistency with scalar
+                bool isRGBAllowedForThisPartition = (((encodingPlan.mode7RGBPartitionEnabled >> partition) & 1) != 0);
+
+                if (!isRGBAllowedForThisPartition)
+                {
+                    errorBetter16 = (errorBetter16 & blockHasNonMaxAlpha);
+                    errorBetter = ParallelMath::Int16FlagToFloat(errorBetter16);
+                }
+            }
+
+            if (ParallelMath::AnySet(errorBetter16))
+            {
+                for (int subset = 0; subset < numSubsets; subset++)
+                {
+                    int shape = partitionShapes[subset];
+                    int shapeStart = BC7Data::g_shapeRanges[shape][0];
+                    int shapeLength = BC7Data::g_shapeRanges[shape][1];
+
+                    for (int epi = 0; epi < 2; epi++)
+                        for (int ch = 0; ch < 4; ch++)
+                            ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shape][epi][ch]);
+
+                    for (int pxi = 0; pxi < shapeLength; pxi++)
+                    {
+                        int px = BC7Data::g_fragments[shapeStart + pxi];
+                        ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
+                    }
+                }
+
+                ParallelMath::ConditionalSet(work.m_error, errorBetter, totalError);
+                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
+                ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
+            }
+        }
+    }
+}
+
+// Evaluates the dual-plane BC7 modes (4 and 5) and stores the best candidate per lane in `work`.
+//
+// For every mode / rotation / index-selector combination that `encodingPlan` enables, three
+// channels ("RGB" after rotation) and the remaining channel ("alpha" after rotation) are fitted
+// independently: each has its own endpoints, index selector, refiner and error accumulator.
+// The best RGB fit and the best alpha fit are tracked separately across all tweak/refine
+// iterations, and their summed error is compared against `work.m_error`.
+//
+// Parameters:
+//   flags           - cvtt::Flags bits. BC7_FastIndexing is tested here; the value is also
+//                     forwarded to ComputeErrorLDR and AggregatedError::Finalize.
+//   pixels          - 16 input pixels x 4 channels.
+//   floatPixels     - float form of the same pixels, used for index selection and alpha refinement.
+//   channelWeights  - per-channel weights; squared locally for error finalization.
+//   encodingPlan    - mode4SP[rotation][indexSelector] / mode5SP[rotation] give the number of
+//                     tweak rounds to run; a value <= 0 disables that combination.
+//   numRefineRounds - endpoint refinement iterations per tweak; values below 1 are treated as 1.
+//   work            - in/out. For lanes whose combined error improves, m_error, m_mode,
+//                     m_u.m_isr.m_rotation, m_u.m_isr.m_indexSelector, m_indexes, m_indexes2 and
+//                     m_ep[0] are overwritten.
+//   rtn             - forwarded to SelectIndexLDR and GetRefinedEndpointsLDR.
+void cvtt::Internal::BC7Computer::TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
+    // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
+    // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
+    // - Separate alpha channel, then weighted RGB
+    // - Alpha+2 other channels, then the independent channel
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    float channelWeightsSq[4];
+    for (int ch = 0; ch < 4; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    for (uint16_t mode = 4; mode <= 5; mode++)
+    {
+        // Tweak-round budget per index-selector value (only entry 0 is iterated for mode 5).
+        int numSP[2] = { 0, 0 };
+
+        for (uint16_t rotation = 0; rotation < 4; rotation++)
+        {
+            if (mode == 4)
+            {
+                numSP[0] = encodingPlan.mode4SP[rotation][0];
+                numSP[1] = encodingPlan.mode4SP[rotation][1];
+            }
+            else
+                numSP[0] = numSP[1] = encodingPlan.mode5SP[rotation];
+
+            // Nothing enabled for this rotation: skip before doing any per-rotation setup.
+            if (numSP[0] == 0 && numSP[1] == 0)
+                continue;
+
+            // Rotation 0 keeps channel 3 as the separate channel; rotation 1/2/3 exchanges it
+            // with channel 0/1/2 respectively.
+            int alphaChannel = (rotation + 3) & 3;
+            int redChannel = (rotation == 1) ? 3 : 0;
+            int greenChannel = (rotation == 2) ? 3 : 1;
+            int blueChannel = (rotation == 3) ? 3 : 2;
+
+            MUInt15 rotatedRGB[16][3];
+            MFloat floatRotatedRGB[16][3];
+
+            for (int px = 0; px < 16; px++)
+            {
+                rotatedRGB[px][0] = pixels[px][redChannel];
+                rotatedRGB[px][1] = pixels[px][greenChannel];
+                rotatedRGB[px][2] = pixels[px][blueChannel];
+
+                for (int ch = 0; ch < 3; ch++)
+                    floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
+            }
+
+            // Only mode 4 iterates both index-selector values.
+            uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
+
+            float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
+            float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
+            float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
+
+            float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
+
+            MFloat preWeightedRotatedRGB[16][3];
+            BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
+
+            for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
+            {
+                int numTweakRounds = numSP[indexSelector];
+
+                if (numTweakRounds <= 0)
+                    continue;
+
+                if (numTweakRounds > MaxTweakRounds)
+                    numTweakRounds = MaxTweakRounds;
+
+                // Initial RGB endpoint estimate from all 16 pre-weighted pixels.
+                EndpointSelector<3, 8> rgbSelector;
+
+                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+                {
+                    for (int px = 0; px < 16; px++)
+                        rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
+
+                    rgbSelector.FinishPass(epPass);
+                }
+
+                // Min/max of the separate channel over the block; input to TweakAlpha.
+                MUInt15 alphaRange[2];
+
+                alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
+                for (int px = 1; px < 16; px++)
+                {
+                    alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
+                    alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
+                }
+
+                // Index bit counts for each plane. In mode 4 the index selector decides which
+                // plane gets 3 bits and which gets 2; mode 5 uses 2 bits for both.
+                int rgbPrec = 0;
+                int alphaPrec = 0;
+
+                if (mode == 4)
+                {
+                    rgbPrec = indexSelector ? 3 : 2;
+                    alphaPrec = indexSelector ? 2 : 3;
+                }
+                else
+                    rgbPrec = alphaPrec = 2;
+
+                UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
+
+                MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
+                MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
+
+                MUInt15 bestRGBIndexes[16];
+                MUInt15 bestAlphaIndexes[16];
+                MUInt15 bestEP[2][4];
+
+                for (int px = 0; px < 16; px++)
+                    bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    MUInt15 rgbEP[2][3];
+                    MUInt15 alphaEP[2];
+
+                    unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
+
+                    TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
+
+                    for (int refine = 0; refine < numRefineRounds; refine++)
+                    {
+                        if (mode == 4)
+                            CompressEndpoints4(rgbEP, alphaEP);
+                        else
+                            CompressEndpoints5(rgbEP, alphaEP);
+
+
+                        IndexSelector<1> alphaIndexSelector;
+                        IndexSelector<3> rgbIndexSelector;
+
+                        {
+                            MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
+                            alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
+                        }
+                        rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
+
+                        EndpointRefiner<3> rgbRefiner;
+                        EndpointRefiner<1> alphaRefiner;
+
+                        rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
+                        alphaRefiner.Init(1 << alphaPrec, uniformWeight);
+
+                        MFloat errorRGB = ParallelMath::MakeFloatZero();
+                        MFloat errorA = ParallelMath::MakeFloatZero();
+
+                        MUInt15 rgbIndexes[16];
+                        MUInt15 alphaIndexes[16];
+
+                        // Block-wide accumulators, only fed in fast-indexing mode.
+                        AggregatedError<3> rgbAggError;
+                        AggregatedError<1> alphaAggError;
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
+                            MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
+
+                            MUInt15 reconstructedRGB[3];
+                            MUInt15 reconstructedAlpha[1];
+
+                            rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
+                            alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
+
+                            if (flags & cvtt::Flags::BC7_FastIndexing)
+                            {
+                                // Fast path: trust the selected index and accumulate its error;
+                                // the totals are finalized once after the pixel loop.
+                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
+                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
+                            }
+                            else
+                            {
+                                // Slow path: also try the neighbouring indexes (index - 1 and
+                                // index + 1, clamped to the valid range) and keep, per lane and
+                                // per plane, whichever yields the lowest error.
+                                AggregatedError<3> baseRGBAggError;
+                                AggregatedError<1> baseAlphaAggError;
+
+                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
+                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
+
+                                MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
+                                MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+
+                                MUInt15 altRGBIndexes[2];
+                                MUInt15 altAlphaIndexes[2];
+
+                                altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
+
+                                altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
+
+                                for (int ii = 0; ii < 2; ii++)
+                                {
+                                    rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
+                                    alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
+
+                                    AggregatedError<3> altRGBAggError;
+                                    AggregatedError<1> altAlphaAggError;
+
+                                    BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
+                                    BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
+
+                                    MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
+                                    MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+
+                                    ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
+                                    ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
+
+                                    rgbError = ParallelMath::Min(altRGBError, rgbError);
+                                    alphaError = ParallelMath::Min(altAlphaError, alphaError);
+
+                                    ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
+                                    ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
+                                }
+
+                                errorRGB = errorRGB + rgbError;
+                                errorA = errorA + alphaError;
+                            }
+
+                            // Refinement input is only gathered when a later refine round will use it.
+                            if (refine != numRefineRounds - 1)
+                            {
+                                rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
+                                alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
+                            }
+
+                            rgbIndexes[px] = rgbIndex;
+                            alphaIndexes[px] = alphaIndex;
+                        }
+
+                        // Fast-indexing totals are produced once, after all 16 pixels have been
+                        // accumulated (the same placement TrySinglePlane uses).
+                        if (flags & cvtt::Flags::BC7_FastIndexing)
+                        {
+                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
+                            errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+                        }
+
+                        ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
+                        ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
+
+                        ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
+                        ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
+
+                        // The two planes are tracked independently, so the retained RGB and alpha
+                        // fits may come from different tweak/refine iterations.
+                        if (ParallelMath::AnySet(rgbBetterInt16))
+                        {
+                            bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
+
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
+
+                            for (int ep = 0; ep < 2; ep++)
+                            {
+                                for (int ch = 0; ch < 3; ch++)
+                                    ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
+                            }
+                        }
+
+                        if (ParallelMath::AnySet(alphaBetterInt16))
+                        {
+                            bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
+
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
+
+                            for (int ep = 0; ep < 2; ep++)
+                                ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
+                        }
+
+                        // Feed the refined endpoints into the next refine round, if there is one.
+                        if (refine != numRefineRounds - 1)
+                        {
+                            rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
+
+                            MUInt15 alphaEPTemp[2][1];
+                            alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
+
+                            for (int i = 0; i < 2; i++)
+                                alphaEP[i] = alphaEPTemp[i][0];
+                        }
+                    } // refine
+                } // tweak
+
+                MFloat combinedError = bestRGBError + bestAlphaError;
+
+                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
+                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                work.m_error = ParallelMath::Min(combinedError, work.m_error);
+
+                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
+                ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
+                ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
+
+                // With the index selector set, the RGB and alpha index arrays trade places in work.
+                for (int px = 0; px < 16; px++)
+                {
+                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
+                    ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
+                }
+
+                for (int ep = 0; ep < 2; ep++)
+                    for (int ch = 0; ch < 4; ch++)
+                        ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
+            }
+        }
+    }
+}
+
+// Exchanges the values held by a and b, going through one temporary copy.
+template<class T>
+void cvtt::Internal::BC7Computer::Swap(T& a, T& b)
+{
+    const T previousB = b;
+    b = a;
+    a = previousB;
+}
+
+void cvtt::Internal::BC7Computer::Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds)
+{
+    MUInt15 pixels[16][4];
+    MFloat floatPixels[16][4];
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
+    }
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
+    }
+
+    BC67::WorkInfo work;
+    memset(&work, 0, sizeof(work));
+
+    work.m_error = ParallelMath::MakeFloat(FLT_MAX);
+
+    {
+        ParallelMath::RoundTowardNearestForScope rtn;
+        TrySinglePlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
+        TryDualPlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        PackingVector pv;
+        pv.Init();
+
+        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
+        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
+        ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
+
+        const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
+
+        ParallelMath::ScalarUInt16 indexes[16];
+        ParallelMath::ScalarUInt16 indexes2[16];
+        ParallelMath::ScalarUInt16 endPoints[3][2][4];
+
+        for (int i = 0; i < 16; i++)
+        {
+            indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
+            if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+                indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
+        }
+
+        for (int subset = 0; subset < 3; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+            {
+                for (int ch = 0; ch < 4; ch++)
+                    endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
+            }
+        }
+
+        int fixups[3] = { 0, 0, 0 };
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
+            bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
+
+            if (flipRGB)
+            {
+                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                    indexes[px] = highIndex - indexes[px];
+            }
+
+            if (flipAlpha)
+            {
+                uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                    indexes2[px] = highIndex - indexes2[px];
+            }
+
+            if (indexSelector)
+                Swap(flipRGB, flipAlpha);
+
+            if (flipRGB)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
+            }
+            if (flipAlpha)
+                Swap(endPoints[0][0][3], endPoints[0][1][3]);
+
+        }
+        else
+        {
+            if (modeInfo.m_numSubsets == 2)
+                fixups[1] = BC7Data::g_fixupIndexes2[partition];
+            else if (modeInfo.m_numSubsets == 3)
+            {
+                fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
+                fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
+            }
+
+            bool flip[3] = { false, false, false };
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+                flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
+
+            if (flip[0] || flip[1] || flip[2])
+            {
+                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                {
+                    int subset = 0;
+                    if (modeInfo.m_numSubsets == 2)
+                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+                    else if (modeInfo.m_numSubsets == 3)
+                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
+
+                    if (flip[subset])
+                        indexes[px] = highIndex - indexes[px];
+                }
+
+                int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
+                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+                {
+                    if (flip[subset])
+                        for (int ch = 0; ch < maxCH; ch++)
+                            Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
+                }
+            }
+        }
+
+        pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
+
+        if (modeInfo.m_partitionBits)
+            pv.Pack(partition, modeInfo.m_partitionBits);
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
+            pv.Pack(rotation, 2);
+        }
+
+        if (modeInfo.m_hasIndexSelector)
+            pv.Pack(indexSelector, 1);
+
+        // Encode RGB
+        for (int ch = 0; ch < 3; ch++)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
+                    epPart >>= (8 - modeInfo.m_rgbBits);
+
+                    pv.Pack(epPart, modeInfo.m_rgbBits);
+                }
+            }
+        }
+
+        // Encode alpha
+        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
+                    epPart >>= (8 - modeInfo.m_alphaBits);
+
+                    pv.Pack(epPart, modeInfo.m_alphaBits);
+                }
+            }
+        }
+
+        // Encode parity bits
+        if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
+                epPart >>= (7 - modeInfo.m_rgbBits);
+                epPart &= 1;
+
+                pv.Pack(epPart, 1);
+            }
+        }
+        else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
+                    epPart >>= (7 - modeInfo.m_rgbBits);
+                    epPart &= 1;
+
+                    pv.Pack(epPart, 1);
+                }
+            }
+        }
+
+        // Encode indexes
+        for (int px = 0; px < 16; px++)
+        {
+            int bits = modeInfo.m_indexBits;
+            if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
+                bits--;
+
+            pv.Pack(indexes[px], bits);
+        }
+
+        // Encode secondary indexes
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                int bits = modeInfo.m_alphaIndexBits;
+                if (px == 0)
+                    bits--;
+
+                pv.Pack(indexes2[px], bits);
+            }
+        }
+
+        pv.Flush(packedBlocks);
+
+        packedBlocks += 16;
+    }
+}
+
+// Decodes one packed BC7 block into a 4x4 block of 8-bit RGBA pixels.
+//
+//   output      - receives the 16 decoded pixels in m_pixels[px][ch].
+//   packedBlock - the encoded block; it is handed to UnpackingVector::Init and
+//                 then read through sequential Unpack() calls.
+//
+// Fields are consumed in bitstream order, so the order of the Unpack() calls
+// below must not change: mode, partition, rotation, index selector, RGB
+// endpoints, alpha endpoints, parity bits, primary indexes, secondary indexes.
+// If none of the first 8 bits is set there is no usable mode and every output
+// channel is written as 0.
+//
+// Endpoints are expanded to 8 bits per channel before interpolation, weights
+// come from BC7Data::g_weightTables, and a non-zero rotation swaps the alpha
+// channel with one colour channel after interpolation.
+void cvtt::Internal::BC7Computer::UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
+{
+    UnpackingVector reader;
+    reader.Init(packedBlock);
+
+    // The mode is unary coded: the position of the first set bit is the mode.
+    int mode = 0;
+    while (mode < 8 && reader.Unpack(1) != 1)
+        mode++;
+
+    if (mode > 7)
+    {
+        // No mode bit was found: emit an all-zero block.
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+                output.m_pixels[px][ch] = 0;
+        }
+
+        return;
+    }
+
+    const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
+
+    // Mode properties that are consulted repeatedly below.
+    const bool hasAlpha = (modeInfo.m_alphaMode != BC7Data::AlphaMode_None);
+    const bool hasSeparateAlpha = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate);
+    const bool hasCombinedAlpha = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined);
+    const int numSubsets = modeInfo.m_numSubsets;
+    const int rgbBits = modeInfo.m_rgbBits;
+    const int alphaBits = modeInfo.m_alphaBits;
+
+    // Optional header fields; a field the mode does not have decodes as 0.
+    int partition = 0;
+    if (modeInfo.m_partitionBits)
+        partition = reader.Unpack(modeInfo.m_partitionBits);
+
+    const int rotation = hasSeparateAlpha ? reader.Unpack(2) : 0;
+    const int indexSelector = modeInfo.m_hasIndexSelector ? reader.Unpack(1) : 0;
+
+    // Anchor ("fixup") pixels are stored with one index bit fewer. Pixel 0 is
+    // always one; additional subsets look theirs up by partition.
+    int anchors[3] = { 0, 0, 0 };
+
+    if (!hasSeparateAlpha)
+    {
+        if (numSubsets == 2)
+            anchors[1] = BC7Data::g_fixupIndexes2[partition];
+        else if (numSubsets == 3)
+        {
+            anchors[1] = BC7Data::g_fixupIndexes3[partition][0];
+            anchors[2] = BC7Data::g_fixupIndexes3[partition][1];
+        }
+    }
+
+    // endPoints[subset][endpoint][channel]; only the first numSubsets subsets
+    // are filled in.
+    int endPoints[3][2][4];
+
+    // RGB endpoints, stored channel by channel, are left-aligned into 8 bits.
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int subset = 0; subset < numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][ch] = (reader.Unpack(rgbBits) << (8 - rgbBits));
+        }
+    }
+
+    // Alpha endpoints follow; modes without alpha get a constant 255 instead.
+    for (int subset = 0; subset < numSubsets; subset++)
+    {
+        for (int ep = 0; ep < 2; ep++)
+        {
+            if (hasAlpha)
+                endPoints[subset][ep][3] = (reader.Unpack(alphaBits) << (8 - alphaBits));
+            else
+                endPoints[subset][ep][3] = 255;
+        }
+    }
+
+    // Parity bits: one per subset (shared by both endpoints) or one per endpoint.
+    // Each is placed directly below the stored bits of every affected channel.
+    const bool parityPerSubset = (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset);
+    const bool parityPerEndpoint = (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint);
+    const int parityBits = (parityPerSubset || parityPerEndpoint) ? 1 : 0;
+
+    if (parityBits != 0)
+    {
+        for (int subset = 0; subset < numSubsets; subset++)
+        {
+            int p = 0;
+
+            for (int ep = 0; ep < 2; ep++)
+            {
+                // A per-subset bit is read once and reused for the second endpoint.
+                if (parityPerEndpoint || ep == 0)
+                    p = reader.Unpack(1);
+
+                for (int ch = 0; ch < 3; ch++)
+                    endPoints[subset][ep][ch] |= p << (7 - rgbBits);
+
+                if (hasAlpha)
+                    endPoints[subset][ep][3] |= p << (7 - alphaBits);
+            }
+        }
+    }
+
+    // Replicate the high bits of each endpoint into the low bits that the
+    // left-alignment above left empty.
+    for (int subset = 0; subset < numSubsets; subset++)
+    {
+        for (int ep = 0; ep < 2; ep++)
+        {
+            int (&color)[4] = endPoints[subset][ep];
+
+            for (int ch = 0; ch < 3; ch++)
+                color[ch] |= (color[ch] >> (rgbBits + parityBits));
+
+            if (hasAlpha)
+                color[3] |= (color[3] >> (alphaBits + parityBits));
+        }
+    }
+
+    // Per-pixel interpolation indexes for the two index sets.
+    int indexes[16];
+    int indexes2[16];
+
+    // Primary indexes: m_indexBits each, except anchor pixels which use one
+    // bit fewer.
+    for (int px = 0; px < 16; px++)
+    {
+        const bool isAnchor = (px == 0) || (px == anchors[1]) || (px == anchors[2]);
+        indexes[px] = reader.Unpack(isAnchor ? (modeInfo.m_indexBits - 1) : modeInfo.m_indexBits);
+    }
+
+    // Secondary indexes exist only in separate-alpha modes (pixel 0 uses one
+    // bit fewer); in every other mode they are all 0.
+    for (int px = 0; px < 16; px++)
+    {
+        if (hasSeparateAlpha)
+            indexes2[px] = reader.Unpack((px == 0) ? (modeInfo.m_alphaIndexBits - 1) : modeInfo.m_alphaIndexBits);
+        else
+            indexes2[px] = 0;
+    }
+
+    // Interpolation weight tables, selected by the bit width of each index set.
+    const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
+    const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
+
+    // Interpolate every pixel between the two endpoints of its subset.
+    for (int px = 0; px < 16; px++)
+    {
+        // Start from the weight each index set selects on its own.
+        int colorWeight = rgbWeights[indexes[px]];
+        int alphaWeight = 0;
+
+        if (hasCombinedAlpha)
+            alphaWeight = colorWeight;
+        else if (hasSeparateAlpha)
+            alphaWeight = alphaWeights[indexes2[px]];
+
+        // An index selector of 1 exchanges the colour and alpha weights.
+        if (indexSelector == 1)
+        {
+            const int swapped = colorWeight;
+            colorWeight = alphaWeight;
+            alphaWeight = swapped;
+        }
+
+        // Which subset (endpoint pair) this pixel belongs to.
+        int subset = 0;
+        if (numSubsets == 2)
+            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+        else if (numSubsets == 3)
+            subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
+
+        // The two endpoints this pixel is interpolated between.
+        const int (&lo)[4] = endPoints[subset][0];
+        const int (&hi)[4] = endPoints[subset][1];
+
+        // Weights are in 1/64 units; adding 32 rounds to nearest before the shift.
+        int pixel[4] = { 0, 0, 0, 255 };
+        for (int ch = 0; ch < 3; ch++)
+            pixel[ch] = ((64 - colorWeight) * lo[ch] + colorWeight * hi[ch] + 32) >> 6;
+
+        if (hasAlpha)
+            pixel[3] = ((64 - alphaWeight) * lo[3] + alphaWeight * hi[3] + 32) >> 6;
+
+        // A non-zero rotation exchanges alpha with channel (rotation - 1).
+        if (rotation != 0)
+        {
+            const int rotated = rotation - 1;
+            const int swapped = pixel[rotated];
+            pixel[rotated] = pixel[3];
+            pixel[3] = swapped;
+        }
+
+        // Narrow each channel to 8 bits and store the pixel.
+        for (int ch = 0; ch < 4; ch++)
+            output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
+    }
+}
+
+// Quantizes one signed endpoint element: the magnitude is scaled by 32/31,
+// rounded to 15 bits, shifted down by (16 - precision), then the sign is reapplied.
+cvtt::ParallelMath::SInt16 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
+{
+    // The input must lie strictly between -31744 and 31744 in every lane.
+    assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
+    assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
+
+    MSInt16 zero = ParallelMath::MakeSInt16(0);
+    ParallelMath::Int16CompFlag negativeLanes = ParallelMath::Less(elem2CL, zero);
+    MUInt15 magnitude = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negativeLanes, zero - elem2CL, elem2CL));
+    MUInt15 quantizedMagnitude = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(magnitude) * 32.0f / 31.0f, ru), 16 - precision);
+    MSInt16 signedMagnitude = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedMagnitude);
+    return ParallelMath::Select(negativeLanes, zero - signedMagnitude, signedMagnitude);
+}
+
+cvtt::ParallelMath::UInt15 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
+{
+    // Scale by 64/31 (capped at 65535), round to 16 bits, then keep the top `precision` bits.
+    return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru), 16 - precision));
+}
+
+// Unquantizes one signed endpoint component stored with `precision` bits.
+//   outUnquantized            - the unquantized value, carrying the sign of comp.
+//   outUnquantizedFinished2CL - (magnitude * 31) >> 5, carrying the sign of comp.
+// Saturation to 0x7fff is tested on the magnitude so that negative components
+// saturate like positive ones, as BC6H signed unquantization requires.
+void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
+{
+    MSInt16 zero = ParallelMath::MakeSInt16(0);
+    ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
+    MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
+
+    MSInt16 unq;
+    MUInt15 absUnq;
+
+    if (precision >= 16)
+    {
+        unq = comp;
+        absUnq = absComp;
+    }
+    else
+    {
+        MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
+        ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
+        ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, ParallelMath::LosslessCast<MSInt16>::Cast(absComp));
+        absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
+        ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
+        ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
+        unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
+    }
+
+    outUnquantized = unq;
+    MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
+    outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
+}
+
+// Unquantizes one unsigned endpoint component stored with `precision` bits.
+//   outUnquantized         - the 16-bit expanded value; below 15 bits, code 0
+//                            maps to 0 and the largest code maps to 0xffff.
+//   outUnquantizedFinished - (outUnquantized * 31) >> 6.
+void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
+{
+    MUInt16 expanded = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
+
+    if (precision < 15)
+    {
+        MUInt15 topCodeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
+        // Move the code into the high bits and add half a step, then pin the extremes.
+        expanded = (expanded << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
+        ParallelMath::ConditionalSet(expanded, ParallelMath::Equal(comp, ParallelMath::MakeUInt15(0)), ParallelMath::MakeUInt16(0));
+        ParallelMath::ConditionalSet(expanded, ParallelMath::Less(topCodeMinusOne, comp), ParallelMath::MakeUInt16(0xffff));
+    }
+
+    outUnquantized = expanded;
+    outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(expanded, ParallelMath::MakeUInt15(31)), 6));
+}
+
+// Quantizes a signed endpoint pair to `precision` bits, initializes
+// `indexSelector` from the unquantized result and selects the index of the
+// pixel at `fixupIndex`. In lanes where that index exceeds indexRange / 2 - 1
+// the index is mirrored and the selector and quantized endpoints are swapped.
+// Only indexes[fixupIndex] is written.
+void cvtt::Internal::BC6HComputer::QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MSInt16 unquantized[2][3];
+    MSInt16 finished[2][3];
+
+    {
+        // The RoundUpForScope object lives only for the quantization loop.
+        ParallelMath::RoundUpForScope roundUp;
+
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MSInt16 quantized = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &roundUp);
+                UnquantizeSingleEndpointElementSigned(quantized, precision, unquantized[epi][ch], finished[epi][ch]);
+                quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(quantized);
+            }
+        }
+    }
+
+    indexSelector.Init(channelWeights, unquantized, finished, indexRange);
+    indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
+
+    MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
+    ParallelMath::Int16CompFlag invert = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1), index);
+
+    if (ParallelMath::AnySet(invert))
+    {
+        ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
+        indexSelector.ConditionalInvert(invert);
+
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MAInt16 formerFirst = quantizedEndPoints[0][ch];
+            quantizedEndPoints[0][ch] = ParallelMath::Select(invert, quantizedEndPoints[1][ch], formerFirst);
+            quantizedEndPoints[1][ch] = ParallelMath::Select(invert, formerFirst, quantizedEndPoints[1][ch]);
+        }
+    }
+
+    indexes[fixupIndex] = index;
+}
+
+// Quantizes an unsigned endpoint pair to `precision` bits, initializes
+// `indexSelector` from the unquantized result and selects the index of the
+// pixel at `fixupIndex`. In lanes where that index exceeds indexRange / 2 - 1
+// the index is mirrored and the selector and quantized endpoints are swapped.
+// Only indexes[fixupIndex] is written.
+void cvtt::Internal::BC6HComputer::QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MUInt16 unquantized[2][3];
+    MUInt16 finished[2][3];
+
+    {
+        // The RoundUpForScope object lives only for the quantization loop.
+        ParallelMath::RoundUpForScope roundUp;
+
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantized = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &roundUp);
+                UnquantizeSingleEndpointElementUnsigned(quantized, precision, unquantized[epi][ch], finished[epi][ch]);
+                quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(quantized);
+            }
+        }
+    }
+
+    indexSelector.Init(channelWeights, unquantized, finished, indexRange);
+    indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
+
+    MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
+    ParallelMath::Int16CompFlag invert = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1), index);
+
+    if (ParallelMath::AnySet(invert))
+    {
+        ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
+        indexSelector.ConditionalInvert(invert);
+
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MAInt16 formerFirst = quantizedEndPoints[0][ch];
+            quantizedEndPoints[0][ch] = ParallelMath::Select(invert, quantizedEndPoints[1][ch], formerFirst);
+            quantizedEndPoints[1][ch] = ParallelMath::Select(invert, formerFirst, quantizedEndPoints[1][ch]);
+        }
+    }
+
+    indexes[fixupIndex] = index;
+}
+
+// Builds the endpoint values a two-subset mode would store and reports, per
+// lane, whether they are representable.
+//   outEncodedEPs - [subset][endpoint][channel]; when isTransformed, every entry
+//                   except [0][0] becomes a delta from [0][0] truncated to bPrec[ch].
+//   outIsLegal    - set where each delta, added back to [0][0], reproduces the
+//                   low aPrec bits of the original endpoint.
+// Channels after one at which no lane is still legal are left unwritten.
+void cvtt::Internal::BC6HComputer::EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
+{
+    ParallelMath::Int16CompFlag legal = ParallelMath::MakeBoolInt16(true);
+    MAInt16 lowBitsMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int epi = 0; epi < 2; epi++)
+        {
+            outEncodedEPs[0][epi][ch] = ep0[epi][ch];
+            outEncodedEPs[1][epi][ch] = ep1[epi][ch];
+        }
+
+        // Slots 1..3 are [0][1], [1][0] and [1][1]; slot 0 is the untouched base.
+        for (int slot = 1; isTransformed && slot < 4; slot++)
+        {
+            MAInt16 &base = outEncodedEPs[0][0][ch];
+            MAInt16 &encoded = outEncodedEPs[slot >> 1][slot & 1][ch];
+            MAInt16 expectedLowBits = (encoded & lowBitsMask);
+
+            MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(encoded, base)), bPrec[ch]);
+            encoded = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
+
+            MAInt16 reconstructed = (ParallelMath::AbstractAdd(encoded, base) & lowBitsMask);
+            legal = legal & ParallelMath::Equal(reconstructed, expectedLowBits);
+        }
+
+        if (!ParallelMath::AnySet(legal))
+            break;
+    }
+
+    outIsLegal = legal;
+}
+
+// Single-subset legality check. When isTransformed, outEncodedEPs[1] becomes a
+// delta from outEncodedEPs[0] truncated to bPrec[ch]; a lane is legal when each
+// delta, added back to endpoint 0, reproduces the low aPrec bits of endpoint 1.
+// Without isTransformed the endpoints are copied through and every lane is legal.
+void cvtt::Internal::BC6HComputer::EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
+{
+    ParallelMath::Int16CompFlag legal = ParallelMath::MakeBoolInt16(true);
+    MAInt16 lowBitsMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MAInt16 &base = outEncodedEPs[0][ch];
+        MAInt16 &encoded = outEncodedEPs[1][ch];
+        base = ep[0][ch];
+        encoded = ep[1][ch];
+        if (!isTransformed)
+            continue;
+
+        MAInt16 expectedLowBits = (encoded & lowBitsMask);
+        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(encoded, base)), bPrec[ch]);
+        encoded = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
+        MAInt16 reconstructed = (ParallelMath::AbstractAdd(encoded, base) & lowBitsMask);
+        legal = legal & ParallelMath::Equal(reconstructed, expectedLowBits);
+    }
+    outIsLegal = legal;
+}
+
+void cvtt::Internal::BC6HComputer::Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
+{
+    if (numTweakRounds < 1)
+        numTweakRounds = 1;
+    else if (numTweakRounds > MaxTweakRounds)
+        numTweakRounds = MaxTweakRounds;
+
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+    else if (numRefineRounds > MaxRefineRounds)
+        numRefineRounds = MaxRefineRounds;
+
+    bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
+    float channelWeightsSq[3];
+
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    MSInt16 pixels[16][3];
+    MFloat floatPixels2CL[16][3];
+    MFloat floatPixelsLinearWeighted[16][3];
+
+    MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
+
+    for (int ch = 0; ch < 3; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MSInt16 pixelValue;
+            ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
+
+            // Convert from sign+magnitude to 2CL
+            if (isSigned)
+            {
+                ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
+                MSInt16 magnitude = (pixelValue & low15Bits);
+                ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
+                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
+            }
+            else
+                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
+
+            pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
+
+            pixels[px][ch] = pixelValue;
+            floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
+            floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
+        }
+    }
+
+    MFloat preWeightedPixels[16][3];
+
+    BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
+
+    MAInt16 bestEndPoints[2][2][3];
+    MUInt15 bestIndexes[16];
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestMode = ParallelMath::MakeUInt15(0);
+    MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    for (int subset = 0; subset < 2; subset++)
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
+
+    UnfinishedEndpoints<3> partitionedUFEP[32][2];
+    UnfinishedEndpoints<3> singleUFEP;
+
+    // Generate UFEP for partitions
+    for (int p = 0; p < 32; p++)
+    {
+        int partitionMask = BC7Data::g_partitionMap[p];
+
+        EndpointSelector<3, 8> epSelectors[2];
+
+        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                int subset = (partitionMask >> px) & 1;
+                epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
+            }
+
+            for (int subset = 0; subset < 2; subset++)
+                epSelectors[subset].FinishPass(pass);
+        }
+
+        for (int subset = 0; subset < 2; subset++)
+            partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
+    }
+
+    // Generate UFEP for single
+    {
+        EndpointSelector<3, 8> epSelector;
+
+        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+        {
+            for (int px = 0; px < 16; px++)
+                epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
+
+            epSelector.FinishPass(pass);
+        }
+
+        singleUFEP = epSelector.GetEndpoints(channelWeights);
+    }
+
+    for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
+    {
+        bool partitioned = (partitionedInt == 1);
+
+        for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
+        {
+            if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
+                continue;
+
+            int numPartitions = partitioned ? 32 : 1;
+            int numSubsets = partitioned ? 2 : 1;
+            int indexBits = partitioned ? 3 : 4;
+            int indexRange = (1 << indexBits);
+
+            for (int p = 0; p < numPartitions; p++)
+            {
+                int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
+
+                const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
+
+                MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
+                MUInt15 metaIndexes[MaxMetaRounds][16];
+                MFloat metaError[MaxMetaRounds][2];
+
+                bool roundValid[MaxMetaRounds][2];
+
+                for (int r = 0; r < MaxMetaRounds; r++)
+                    for (int subset = 0; subset < 2; subset++)
+                        roundValid[r][subset] = true;
+
+                for (int subset = 0; subset < numSubsets; subset++)
+                {
+                    for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
+                    {
+                        EndpointRefiner<3> refiners[2];
+
+                        bool abortRemainingRefines = false;
+                        for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
+                        {
+                            int metaRound = tweak * MaxRefineRounds + refinePass;
+
+                            if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
+                                abortRemainingRefines = true;
+
+                            if (abortRemainingRefines)
+                            {
+                                roundValid[metaRound][subset] = false;
+                                continue;
+                            }
+
+                            MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
+                            MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
+
+                            MSInt16 endPointsColorSpace[2][3];
+
+                            if (refinePass == 0)
+                            {
+                                UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
+
+                                if (isSigned)
+                                    ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
+                                else
+                                    ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
+                            }
+                            else
+                                refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
+
+                            refiners[subset].Init(indexRange, channelWeights);
+
+                            int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
+
+                            IndexSelectorHDR<3> indexSelector;
+                            if (isSigned)
+                                QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
+                            else
+                                QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
+
+                            if (metaRound > 0)
+                            {
+                                ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
+
+                                for (int prevRound = 0; prevRound < metaRound; prevRound++)
+                                {
+                                    MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
+
+                                    ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
+
+                                    for (int epi = 0; epi < 2; epi++)
+                                        for (int ch = 0; ch < 3; ch++)
+                                            same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
+
+                                    anySame = (anySame | same);
+                                    if (ParallelMath::AllSet(anySame))
+                                        break;
+                                }
+
+                                if (ParallelMath::AllSet(anySame))
+                                {
+                                    roundValid[metaRound][subset] = false;
+                                    continue;
+                                }
+                            }
+
+                            MFloat subsetError = ParallelMath::MakeFloatZero();
+
+                            {
+                                for (int px = 0; px < 16; px++)
+                                {
+                                    if (subset != ((partitionMask >> px) & 1))
+                                        continue;
+
+                                    MUInt15 index;
+                                    if (px == fixupIndex)
+                                        index = mrIndexes[px];
+                                    else
+                                    {
+                                        index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
+                                        mrIndexes[px] = index;
+                                    }
+
+                                    MSInt16 reconstructed[3];
+                                    if (isSigned)
+                                        indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
+                                    else
+                                        indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
+
+                                    subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
+
+                                    if (refinePass != numRefineRounds - 1)
+                                        refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
+                                }
+                            }
+
+                            metaError[metaRound][subset] = subsetError;
+                        }
+                    }
+                }
+
+                // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
+                int numMeta1 = partitioned ? MaxMetaRounds : 1;
+                for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
+                {
+                    if (!roundValid[meta0][0])
+                        continue;
+
+                    for (int meta1 = 0; meta1 < numMeta1; meta1++)
+                    {
+                        MFloat combinedError = metaError[meta0][0];
+                        if (partitioned)
+                        {
+                            if (!roundValid[meta1][1])
+                                continue;
+
+                            combinedError = combinedError + metaError[meta1][1];
+                        }
+
+                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
+                        if (!ParallelMath::AnySet(errorBetter))
+                            continue;
+
+                        ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                        // Figure out if this is encodable
+                        for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
+                        {
+                            const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
+
+                            if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
+                                continue;
+
+                            MAInt16 encodedEPs[2][2][3];
+                            ParallelMath::Int16CompFlag isLegal;
+                            if (partitioned)
+                                EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
+                            else
+                                EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
+
+                            ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
+                            if (!ParallelMath::AnySet(isLegalAndBetter))
+                                continue;
+
+                            ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
+
+                            ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
+                            ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
+                            ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
+
+                            for (int subset = 0; subset < numSubsets; subset++)
+                            {
+                                for (int epi = 0; epi < 2; epi++)
+                                {
+                                    for (int ch = 0; ch < 3; ch++)
+                                        ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
+                                }
+                            }
+
+                            for (int px = 0; px < 16; px++)
+                            {
+                                int subset = ((partitionMask >> px) & 1);
+                                if (subset == 0)
+                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
+                                else
+                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
+                            }
+
+                            needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
+                            if (!ParallelMath::AnySet(needsCommit))
+                                break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // At this point, everything should be set
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
+        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
+        int32_t eps[2][2][3];
+        ParallelMath::ScalarUInt16 indexes[16];
+
+        const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
+
+        BC6H_IO::WriteFunc_t writeFunc = BC6H_IO::g_writeFuncs[mode];
+
+        const int headerBits = modeInfo.m_partitioned ? 82 : 65;
+
+        for (int subset = 0; subset < 2; subset++)
+        {
+            for (int epi = 0; epi < 2; epi++)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
+            }
+        }
+
+        for (int px = 0; px < 16; px++)
+            indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
+
+        uint16_t modeID = modeInfo.m_modeID;
+
+        PackingVector pv;
+
+        {
+            uint32_t header[3];
+            writeFunc(header, modeID, partition,
+                eps[0][0][0], eps[0][1][0], eps[1][0][0], eps[1][1][0],
+                eps[0][0][1], eps[0][1][1], eps[1][0][1], eps[1][1][1],
+                eps[0][0][2], eps[0][1][2], eps[1][0][2], eps[1][1][2]
+            );
+
+            pv.InitPacked(header, headerBits);
+        }
+
+        int fixupIndex1 = 0;
+        int indexBits = 4;
+        if (modeInfo.m_partitioned)
+        {
+            fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
+            indexBits = 3;
+        }
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
+            if (px == 0 || px == fixupIndex1)
+                pv.Pack(index, indexBits - 1);
+            else
+                pv.Pack(index, indexBits);
+        }
+
+        pv.Flush(packedBlocks + 16 * block);
+    }
+}
+
+// Sign-extends v in place: when bit (bits - 1) of v is set, every bit from position `bits` upward is set too.
+void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits)
+{
+    v |= (v & (1 << (bits - 1))) ? ~((1 << bits) - 1) : 0;
+}
+
+// Decodes one BC6H block read from pBC into the 16 pixels of output.
+// isSigned selects the signed variants of sign extension, endpoint unquantization and final scaling.
+// Alpha is always 0x3c00 (1.0); a mode ID that matches no g_hdrModes entry decodes to black.
+void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
+{
+    UnpackingVector pv;
+    pv.Init(pBC);
+
+    // The mode ID is 2 bits wide when it reads 0 or 1; any other value is extended by 3 more bits.
+    int modeBits = pv.Unpack(2);
+    if (modeBits != 0 && modeBits != 1)
+        modeBits |= pv.Unpack(3) << 2;
+
+    int mode = -1;
+    for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
+    {
+        if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
+        {
+            mode = possibleMode;
+            break;
+        }
+    }
+
+    // No table entry carries this mode ID: emit black with alpha 1.0.
+    if (mode < 0)
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                output.m_pixels[px][ch] = 0;
+            output.m_pixels[px][3] = 0x3c00;	// 1.0
+        }
+        return;
+    }
+
+    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
+    const int headerBits = modeInfo.m_partitioned ? 82 : 65;
+    const BC6H_IO::ReadFunc_t readFunc = BC6H_IO::g_readFuncs[mode];
+
+    uint16_t partition = 0;
+    int32_t eps[2][2][3];
+
+    for (int subset = 0; subset < 2; subset++)
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                eps[subset][epi][ch] = 0;
+
+    {
+        uint32_t header[3];
+        uint16_t codedEPs[2][2][3];
+        pv.UnpackStart(header, headerBits);
+
+        readFunc(header, partition,
+            codedEPs[0][0][0], codedEPs[0][1][0], codedEPs[1][0][0], codedEPs[1][1][0],
+            codedEPs[0][0][1], codedEPs[0][1][1], codedEPs[1][0][1], codedEPs[1][1][1],
+            codedEPs[0][0][2], codedEPs[0][1][2], codedEPs[1][0][2], codedEPs[1][1][2]
+        );
+
+        for (int subset = 0; subset < 2; subset++)
+            for (int epi = 0; epi < 2; epi++)
+                for (int ch = 0; ch < 3; ch++)
+                    eps[subset][epi][ch] = codedEPs[subset][epi][ch];
+    }
+
+    int fixupIndex1 = 0;
+    int indexBits = 4;
+    int numSubsets = 1;
+    if (modeInfo.m_partitioned)
+    {
+        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
+        indexBits = 3;
+        numSubsets = 2;
+    }
+
+    int indexes[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (px == 0 || px == fixupIndex1)
+            indexes[px] = pv.Unpack(indexBits - 1);
+        else
+            indexes[px] = pv.Unpack(indexBits);
+    }
+
+    if (modeInfo.m_partitioned)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            if (isSigned)
+                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
+            if (modeInfo.m_transformed || isSigned)
+            {
+                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
+                SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
+                SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
+            }
+        }
+    }
+    else
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            if (isSigned)
+                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
+            if (modeInfo.m_transformed || isSigned)
+                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
+        }
+    }
+
+    int aPrec = modeInfo.m_aPrec;
+
+    // Transformed modes: every endpoint except eps[0][0] is a delta from eps[0][0]; add it back and wrap to aPrec bits.
+    if (modeInfo.m_transformed)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            int wrapMask = (1 << aPrec) - 1;
+
+            eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
+            if (isSigned)
+                SignExtendSingle(eps[0][1][ch], aPrec);
+
+            if (modeInfo.m_partitioned)
+            {
+                eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
+                eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
+
+                if (isSigned)
+                {
+                    SignExtendSingle(eps[1][0][ch], aPrec);
+                    SignExtendSingle(eps[1][1][ch], aPrec);
+                }
+            }
+        }
+    }
+
+    // Unquantize endpoints
+    for (int subset = 0; subset < numSubsets; subset++)
+    {
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                int &v = eps[subset][epi][ch];
+
+                if (isSigned)
+                {
+                    if (aPrec >= 16)
+                    {
+                        // Nothing
+                    }
+                    else
+                    {
+                        bool s = false;
+                        int comp = v;
+                        if (v < 0)
+                        {
+                            s = true;
+                            comp = -comp;
+                        }
+
+                        int unq = 0;
+                        if (comp == 0)
+                            unq = 0;
+                        else if (comp >= ((1 << (aPrec - 1)) - 1))
+                            unq = 0x7fff;
+                        else
+                            unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
+
+                        if (s)
+                            unq = -unq;
+
+                        v = unq;
+                    }
+                }
+                else
+                {
+                    if (aPrec >= 15)
+                    {
+                        // Nothing
+                    }
+                    else if (v == 0)
+                    {
+                        // Nothing
+                    }
+                    else if (v == ((1 << aPrec) - 1))
+                        v = 0xffff;
+                    else
+                        v = ((v << 16) + 0x8000) >> aPrec;
+                }
+            }
+        }
+    }
+
+    const int *weights = BC7Data::g_weightTables[indexBits];
+
+    for (int px = 0; px < 16; px++)
+    {
+        int subset = 0;
+        if (modeInfo.m_partitioned)
+            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+
+        int w = weights[indexes[px]];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
+
+            if (isSigned)
+            {
+                if (comp < 0)
+                    comp = -(((-comp) * 31) >> 5);
+                else
+                    comp = (comp * 31) >> 5;
+
+                int s = 0;
+                if (comp < 0)
+                {
+                    s = 0x8000;
+                    comp = -comp;
+                }
+
+                output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
+            }
+            else
+            {
+                comp = (comp * 31) >> 6;
+                output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
+            }
+        }
+        output.m_pixels[px][3] = 0x3c00;	// 1.0
+    }
+}
+
+// Builds a BC7 encoding plan from a quality level, which is clamped to the range 1..100.
+// The leading (list size * quality / 100) entries of the RGB and RGBA priority lists are unpacked
+// into per-mode seed point counts, which are then turned into the plan.
+void cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality)
+{
+    static const int kMaxQuality = 100;
+    const int clampedQuality = (quality < 1) ? 1 : ((quality > kMaxQuality) ? kMaxQuality : quality);
+
+    // Any slot that no consumed list entry touches keeps a seed point count of zero.
+    BC7FineTuningParams ftParams;
+    memset(&ftParams, 0, sizeof(ftParams));
+
+    // Pass 0 consumes the RGB priority list, pass 1 the RGBA priority list.
+    for (int pass = 0; pass < 2; pass++)
+    {
+        const bool useRGBA = (pass != 0);
+        const uint16_t *codes = useRGBA ? cvtt::Tables::BC7Prio::g_bc7PrioCodesRGBA : cvtt::Tables::BC7Prio::g_bc7PrioCodesRGB;
+        const int numCodes = useRGBA
+            ? (cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGBA * clampedQuality / kMaxQuality)
+            : (cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGB * clampedQuality / kMaxQuality);
+
+        for (int codeIndex = 0; codeIndex < numCodes; codeIndex++)
+        {
+            const uint16_t code = codes[codeIndex];
+            const uint8_t sp = static_cast<uint8_t>(cvtt::Tables::BC7Prio::UnpackSeedPointCount(code));
+            const int mode = cvtt::Tables::BC7Prio::UnpackMode(code);
+
+            // Modes 0-3 and 7 are keyed by partition, mode 4 by rotation and index selector,
+            // mode 5 by rotation; mode 6 has a single slot. Any other mode value is skipped.
+            switch (mode)
+            {
+            case 0:
+                ftParams.mode0SP[cvtt::Tables::BC7Prio::UnpackPartition(code)] = sp;
+                break;
+            case 1:
+                ftParams.mode1SP[cvtt::Tables::BC7Prio::UnpackPartition(code)] = sp;
+                break;
+            case 2:
+                ftParams.mode2SP[cvtt::Tables::BC7Prio::UnpackPartition(code)] = sp;
+                break;
+            case 3:
+                ftParams.mode3SP[cvtt::Tables::BC7Prio::UnpackPartition(code)] = sp;
+                break;
+            case 4:
+                ftParams.mode4SP[cvtt::Tables::BC7Prio::UnpackRotation(code)][cvtt::Tables::BC7Prio::UnpackIndexSelector(code)] = sp;
+                break;
+            case 5:
+                ftParams.mode5SP[cvtt::Tables::BC7Prio::UnpackRotation(code)] = sp;
+                break;
+            case 6:
+                ftParams.mode6SP = sp;
+                break;
+            case 7:
+                ftParams.mode7SP[cvtt::Tables::BC7Prio::UnpackPartition(code)] = sp;
+                break;
+            }
+        }
+    }
+
+    // Expand the collected seed point counts into the full plan.
+    ConfigureBC7EncodingPlanFromFineTuningParams(encodingPlan, ftParams);
+}
+
+// Generates a BC7 encoding plan from fine-tuning parameters.
+// The plan is cleared first. For modes 0-3, 6 and 7 a zero seed point count leaves the partition
+// (or mode 6) disabled; mode 4 and 5 counts are copied as-is. Always returns true.
+bool cvtt::Kernels::ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params)
+{
+    memset(&encodingPlan, 0, sizeof(encodingPlan));
+
+    // Mode 0: 16 partitions, 3 shapes each (g_shapes3), tracked in the RGB shape table.
+    for (int part = 0; part < 16; part++)
+    {
+        const uint8_t seeds = params.mode0SP[part];
+        if (seeds != 0)
+        {
+            encodingPlan.mode0PartitionEnabled |= static_cast<uint16_t>(1) << part;
+            for (int sub = 0; sub < 3; sub++)
+            {
+                const int shapeIndex = cvtt::Internal::BC7Data::g_shapes3[part][sub];
+                if (encodingPlan.seedPointsForShapeRGB[shapeIndex] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shapeIndex] = seeds;
+            }
+        }
+    }
+
+    // Mode 1: 64 partitions, 2 shapes each (g_shapes2), tracked in the RGB shape table.
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seeds = params.mode1SP[part];
+        if (seeds != 0)
+        {
+            encodingPlan.mode1PartitionEnabled |= static_cast<uint64_t>(1) << part;
+            for (int sub = 0; sub < 2; sub++)
+            {
+                const int shapeIndex = cvtt::Internal::BC7Data::g_shapes2[part][sub];
+                if (encodingPlan.seedPointsForShapeRGB[shapeIndex] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shapeIndex] = seeds;
+            }
+        }
+    }
+
+    // Mode 2: 64 partitions, 3 shapes each (g_shapes3), tracked in the RGB shape table.
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seeds = params.mode2SP[part];
+        if (seeds != 0)
+        {
+            encodingPlan.mode2PartitionEnabled |= static_cast<uint64_t>(1) << part;
+            for (int sub = 0; sub < 3; sub++)
+            {
+                const int shapeIndex = cvtt::Internal::BC7Data::g_shapes3[part][sub];
+                if (encodingPlan.seedPointsForShapeRGB[shapeIndex] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shapeIndex] = seeds;
+            }
+        }
+    }
+
+    // Mode 3: 64 partitions, 2 shapes each (g_shapes2), tracked in the RGB shape table.
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seeds = params.mode3SP[part];
+        if (seeds != 0)
+        {
+            encodingPlan.mode3PartitionEnabled |= static_cast<uint64_t>(1) << part;
+            for (int sub = 0; sub < 2; sub++)
+            {
+                const int shapeIndex = cvtt::Internal::BC7Data::g_shapes2[part][sub];
+                if (encodingPlan.seedPointsForShapeRGB[shapeIndex] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shapeIndex] = seeds;
+            }
+        }
+    }
+
+    // Mode 4: seed point counts are copied through for each rotation and both index modes.
+    for (int rot = 0; rot < 4; rot++)
+    {
+        encodingPlan.mode4SP[rot][0] = params.mode4SP[rot][0];
+        encodingPlan.mode4SP[rot][1] = params.mode4SP[rot][1];
+    }
+
+    // Mode 5: seed point counts are copied through for each rotation.
+    for (int rot = 0; rot < 4; rot++)
+        encodingPlan.mode5SP[rot] = params.mode5SP[rot];
+
+    // Mode 6: one seed point count for the whole mode; a non-zero count enables the mode and
+    // is merged into the RGBA shape table entry named by g_shapes1[0][0].
+    const uint8_t mode6Seeds = params.mode6SP;
+    if (mode6Seeds != 0)
+    {
+        const int shapeIndex = cvtt::Internal::BC7Data::g_shapes1[0][0];
+
+        encodingPlan.mode6Enabled = true;
+        if (encodingPlan.seedPointsForShapeRGBA[shapeIndex] < mode6Seeds)
+            encodingPlan.seedPointsForShapeRGBA[shapeIndex] = mode6Seeds;
+    }
+
+    // Mode 7: 64 partitions, 2 shapes each (g_shapes2), tracked in the RGBA shape table.
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seeds = params.mode7SP[part];
+        if (seeds != 0)
+        {
+            encodingPlan.mode7RGBAPartitionEnabled |= static_cast<uint64_t>(1) << part;
+            for (int sub = 0; sub < 2; sub++)
+            {
+                const int shapeIndex = cvtt::Internal::BC7Data::g_shapes2[part][sub];
+                if (encodingPlan.seedPointsForShapeRGBA[shapeIndex] < seeds)
+                    encodingPlan.seedPointsForShapeRGBA[shapeIndex] = seeds;
+            }
+        }
+    }
+
+    // List every RGB shape that ended up with a non-zero seed point count.
+    for (int shape = 0; shape < BC7EncodingPlan::kNumRGBShapes; shape++)
+    {
+        if (encodingPlan.seedPointsForShapeRGB[shape] <= 0)
+            continue;
+        encodingPlan.rgbShapeList[encodingPlan.rgbNumShapesToEvaluate++] = shape;
+    }
+
+    // List every RGBA shape that ended up with a non-zero seed point count.
+    for (int shape = 0; shape < BC7EncodingPlan::kNumRGBAShapes; shape++)
+    {
+        if (encodingPlan.seedPointsForShapeRGBA[shape] <= 0)
+            continue;
+        encodingPlan.rgbaShapeList[encodingPlan.rgbaNumShapesToEvaluate++] = shape;
+    }
+
+    // The RGB variant of the mode 7 mask drops every partition that mode 3 already has enabled.
+    encodingPlan.mode7RGBPartitionEnabled = encodingPlan.mode7RGBAPartitionEnabled & ~encodingPlan.mode3PartitionEnabled;
+    return true;
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.h b/thirdparty/cvtt/ConvectionKernels_BC67.h
new file mode 100644
index 0000000000..b929711187
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC67.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "ConvectionKernels_ParallelMath.h"
+
+
+namespace cvtt
+{
+    namespace Tables
+    {
+        namespace BC7SC
+        {
+            struct Table; // Forward declaration; BC7Computer::TrySingleColorRGBAMultiTable takes an array of pointers to these.
+        }
+    }
+
+    namespace Internal
+    {
+        namespace BC67
+        {
+            struct WorkInfo; // Forward declaration; passed by reference to BC7Computer::TrySinglePlane and TryDualPlane.
+        }
+
+        template<int TVectorSize>
+        class IndexSelectorHDR; // Forward declaration; BC6HComputer::QuantizeEndpointsSigned/Unsigned take an IndexSelectorHDR<3>&.
+    }
+
+    struct PixelBlockU8; // Forward declaration; block type used by BC7Computer::Pack and BC7Computer::UnpackOne.
+}
+
+namespace cvtt // NOTE(review): PixelBlockF16, BC7EncodingPlan and the uintN_t types are not declared in this header; presumably the includer provides them - confirm.
+{
+    namespace Internal
+    {
+        class BC7Computer // BC7 block codec; every member is static and only declared here.
+        {
+        public:
+            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds);
+            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock);
+
+        private:
+            static const int MaxTweakRounds = 4;
+
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::Float MFloat;
+
+            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2]);
+            static void Quantize(MUInt15* color, int bits, int channels);
+            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels);
+            static void Unquantize(MUInt15* color, int bits, int channels);
+            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p);
+            static void CompressEndpoints2(MUInt15 ep[2][4]);
+            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2]);
+            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2]);
+            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2]);
+            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn);
+
+            template<class T>
+            static void Swap(T& a, T& b);
+        };
+
+        // BC6H block codec; every member is static.
+        class BC6HComputer
+        {
+        public:
+            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds); // Writes ParallelMath::ParallelSize encoded blocks, block i at packedBlocks + 16 * i.
+            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned); // Decodes one block from pBC into output; alpha is always set to 0x3c00 (1.0).
+
+        private:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+            static const int MaxTweakRounds = 4;
+            static const int MaxRefineRounds = 3;
+
+            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru);
+            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru);
+            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL);
+            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished);
+            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal); // Pack commits outEncodedEPs only in lanes where outIsLegal is set.
+            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal); // Single-subset counterpart, used by Pack for unpartitioned modes.
+            static void SignExtendSingle(int &v, int bits); // If bit (bits - 1) of v is set, sets every higher bit as well (in place).
+        };
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp
new file mode 100644
index 0000000000..753b6f9000
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp
@@ -0,0 +1,881 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC6H_IO.h"
+
+namespace cvtt
+{
+    namespace BC6H_IO
+    {
+        void WriteMode0(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-0 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x3u) | ((gy >> 2) & 0x4u) | ((by >> 1) & 0x8u) | (bz & 0x10u) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode1(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-1 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x3u) | ((gy >> 3) & 0x4u) | ((gz >> 1) & 0x18u) | ((uint32_t(rw) << 5) & 0xfe0u) | ((uint32_t(bz) << 12) & 0x3000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x3f8000u) | ((uint32_t(by) << 17) & 0x400000u) | ((uint32_t(bz) << 21) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bz >> 3) & 0x1u) | ((bz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0x1f8u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x7e000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x1f800000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x7eu) | ((uint32_t(rz) << 7) & 0x1f80u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode2(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-2 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0xf8u) | ((rw >> 2) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x1e000u) | ((uint32_t(gw) << 7) & 0x20000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x7800000u) | ((uint32_t(bw) << 17) & 0x8000000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode3(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-3 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x78u) | ((rw >> 3) & 0x80u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(gw) << 8) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x7800000u) | ((uint32_t(bw) << 17) & 0x8000000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x1eu) | ((uint32_t(bz) << 5) & 0x20u) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0x780u) | ((uint32_t(gy) << 7) & 0x800u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode4(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-4 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x78u) | ((rw >> 3) & 0x80u) | ((uint32_t(by) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x1e000u) | ((uint32_t(gw) << 7) & 0x20000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bw) << 18) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x1eu) | ((uint32_t(bz) << 4) & 0x60u) | ((uint32_t(rz) << 7) & 0x780u) | ((uint32_t(bz) << 7) & 0x800u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode5(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-5 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x3fe0u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0xff8000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x3u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode6(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-6 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x1fe0u) | ((uint32_t(gz) << 9) & 0x2000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x7f8000u) | ((uint32_t(bz) << 21) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x1u) | ((bz >> 2) & 0x6u) | ((uint32_t(rx) << 3) & 0x1f8u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x7eu) | ((uint32_t(rz) << 7) & 0x1f80u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode7(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-7 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x1fe0u) | ((uint32_t(bz) << 13) & 0x2000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x7f8000u) | ((uint32_t(gy) << 18) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x1u) | ((gz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x7e000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode8(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-8 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x1fe0u) | ((uint32_t(bz) << 12) & 0x2000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x7f8000u) | ((uint32_t(by) << 18) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x1u) | ((bz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x1f800000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode9(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, d and the r/g/b w/x/y/z arguments into encoded[0..2] using the mode-9 bit layout; encoded[3] is not written and encoded[2] receives bits 0-17 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7e0u) | ((uint32_t(gz) << 7) & 0x800u) | ((uint32_t(bz) << 12) & 0x3000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x1f8000u) | ((uint32_t(gy) << 16) & 0x200000u) | ((uint32_t(by) << 17) & 0x400000u) | ((uint32_t(bz) << 21) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0x7e000000u) | ((uint32_t(gz) << 26) & 0x80000000u);
+            encoded[1] = ((bz >> 3) & 0x1u) | ((bz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0x1f8u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x7e000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x1f800000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x7eu) | ((uint32_t(rz) << 7) & 0x1f80u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode10(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw/gw/bw and rx/gx/bx into encoded[0..2] using the mode-10 bit layout; d, ry, rz, gy, gz, by and bz are accepted but not used. encoded[3] is not written and encoded[2] receives bit 0 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x1ff8u) | ((uint32_t(gx) << 13) & 0x7fe000u) | ((uint32_t(bx) << 23) & 0xff800000u);
+            encoded[2] = ((bx >> 9) & 0x1u);
+        }
+
+        void WriteMode11(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw/gw/bw and rx/gx/bx into encoded[0..2] using the mode-11 bit layout; d, ry, rz, gy, gz, by and bz are accepted but not used. encoded[3] is not written and encoded[2] receives bit 0 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0xff8u) | ((uint32_t(rw) << 2) & 0x1000u) | ((uint32_t(gx) << 13) & 0x3fe000u) | ((uint32_t(gw) << 12) & 0x400000u) | ((uint32_t(bx) << 23) & 0xff800000u);
+            encoded[2] = ((bw >> 10) & 0x1u);
+        }
+
+        void WriteMode12(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw/gw/bw and rx/gx/bx into encoded[0..2] using the mode-12 bit layout; d, ry, rz, gy, gz, by and bz are accepted but not used. encoded[3] is not written and encoded[2] receives bit 0 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x7f8u) | (rw & 0x800u) | ((uint32_t(rw) << 2) & 0x1000u) | ((uint32_t(gx) << 13) & 0x1fe000u) | ((uint32_t(gw) << 10) & 0x200000u) | ((uint32_t(gw) << 12) & 0x400000u) | ((uint32_t(bx) << 23) & 0x7f800000u) | ((uint32_t(bw) << 20) & 0x80000000u);
+            encoded[2] = ((bw >> 10) & 0x1u);
+        }
+
+        void WriteMode13(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw/gw/bw and rx/gx/bx into encoded[0..2] using the mode-13 bit layout (bits 10-15 of rw/gw/bw are stored one bit at a time); d, ry, rz, gy, gz, by and bz are accepted but not used. encoded[3] is not written and encoded[2] receives bit 0 only. Each left-shift operand is widened to uint32_t first: an integer-promoted uint16_t is a signed int, and shifting its bits past bit 31 is undefined behaviour before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x78u) | ((rw >> 8) & 0x80u) | ((rw >> 6) & 0x100u) | ((rw >> 4) & 0x200u) | ((rw >> 2) & 0x400u) | (rw & 0x800u) | ((uint32_t(rw) << 2) & 0x1000u) | ((uint32_t(gx) << 13) & 0x1e000u) | ((uint32_t(gw) << 2) & 0x20000u) | ((uint32_t(gw) << 4) & 0x40000u) | ((uint32_t(gw) << 6) & 0x80000u) | ((uint32_t(gw) << 8) & 0x100000u) | ((uint32_t(gw) << 10) & 0x200000u) | ((uint32_t(gw) << 12) & 0x400000u) | ((uint32_t(bx) << 23) & 0x7800000u) | ((uint32_t(bw) << 12) & 0x8000000u) | ((uint32_t(bw) << 14) & 0x10000000u) | ((uint32_t(bw) << 16) & 0x20000000u) | ((uint32_t(bw) << 18) & 0x40000000u) | ((uint32_t(bw) << 20) & 0x80000000u);
+            encoded[2] = ((bw >> 10) & 0x1u);
+        }
+
+        void ReadMode0(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-0 bit layout (the inverse of WriteMode0's packing). Bits 0-1 of encoded[0], where WriteMode0 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            gy |= ((encoded[0] << 2) & 0x10u); // encoded[0] bit 2 -> gy bit 4
+            by |= ((encoded[0] << 1) & 0x10u); // encoded[0] bit 3 -> by bit 4
+            bz |= (encoded[0] & 0x10u); // encoded[0] bit 4 -> bz bit 4
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // encoded[0] bits 5-14 -> rw bits 0-9
+            gw |= ((encoded[0] >> 15) & 0x3ffu); // encoded[0] bits 15-24 -> gw bits 0-9
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x380u); // encoded[1] bits 0-2 -> bw bits 7-9
+            rx |= ((encoded[1] >> 3) & 0x1fu); // encoded[1] bits 3-7 -> rx bits 0-4
+            gz |= ((encoded[1] >> 4) & 0x10u); // encoded[1] bit 8 -> gz bit 4
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x1fu); // encoded[1] bits 13-17 -> gx bits 0-4
+            bz |= ((encoded[1] >> 18) & 0x1u); // encoded[1] bit 18 -> bz bit 0
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x1fu); // encoded[1] bits 23-27 -> bx bits 0-4
+            bz |= ((encoded[1] >> 27) & 0x2u); // encoded[1] bit 28 -> bz bit 1
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x1fu); // encoded[2] bits 1-5 -> ry bits 0-4
+            bz |= ((encoded[2] >> 4) & 0x4u); // encoded[2] bit 6 -> bz bit 2
+            rz |= ((encoded[2] >> 7) & 0x1fu); // encoded[2] bits 7-11 -> rz bits 0-4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode1(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-1 bit layout (the inverse of WriteMode1's packing). Bits 0-1 of encoded[0], where WriteMode1 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            gy |= ((encoded[0] << 3) & 0x20u); // encoded[0] bit 2 -> gy bit 5
+            gz |= ((encoded[0] << 1) & 0x30u); // encoded[0] bits 3-4 -> gz bits 4-5
+            rw |= ((encoded[0] >> 5) & 0x7fu); // encoded[0] bits 5-11 -> rw bits 0-6
+            bz |= ((encoded[0] >> 12) & 0x3u); // encoded[0] bits 12-13 -> bz bits 0-1
+            by |= ((encoded[0] >> 10) & 0x10u); // encoded[0] bit 14 -> by bit 4
+            gw |= ((encoded[0] >> 15) & 0x7fu); // encoded[0] bits 15-21 -> gw bits 0-6
+            by |= ((encoded[0] >> 17) & 0x20u); // encoded[0] bit 22 -> by bit 5
+            bz |= ((encoded[0] >> 21) & 0x4u); // encoded[0] bit 23 -> bz bit 2
+            gy |= ((encoded[0] >> 20) & 0x10u); // encoded[0] bit 24 -> gy bit 4
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bz |= ((encoded[1] << 3) & 0x8u); // encoded[1] bit 0 -> bz bit 3
+            bz |= ((encoded[1] << 4) & 0x20u); // encoded[1] bit 1 -> bz bit 5
+            bz |= ((encoded[1] << 2) & 0x10u); // encoded[1] bit 2 -> bz bit 4
+            rx |= ((encoded[1] >> 3) & 0x3fu); // encoded[1] bits 3-8 -> rx bits 0-5
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x3fu); // encoded[1] bits 13-18 -> gx bits 0-5
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x3fu); // encoded[1] bits 23-28 -> bx bits 0-5
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x3fu); // encoded[2] bits 1-6 -> ry bits 0-5
+            rz |= ((encoded[2] >> 7) & 0x3fu); // encoded[2] bits 7-12 -> rz bits 0-5
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode2(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-2 bit layout (the inverse of WriteMode2's packing). Bits 0-4 of encoded[0], where WriteMode2 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // encoded[0] bits 5-14 -> rw bits 0-9
+            gw |= ((encoded[0] >> 15) & 0x3ffu); // encoded[0] bits 15-24 -> gw bits 0-9
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x380u); // encoded[1] bits 0-2 -> bw bits 7-9
+            rx |= ((encoded[1] >> 3) & 0x1fu); // encoded[1] bits 3-7 -> rx bits 0-4
+            rw |= ((encoded[1] << 2) & 0x400u); // encoded[1] bit 8 -> rw bit 10
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0xfu); // encoded[1] bits 13-16 -> gx bits 0-3
+            gw |= ((encoded[1] >> 7) & 0x400u); // encoded[1] bit 17 -> gw bit 10
+            bz |= ((encoded[1] >> 18) & 0x1u); // encoded[1] bit 18 -> bz bit 0
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0xfu); // encoded[1] bits 23-26 -> bx bits 0-3
+            bw |= ((encoded[1] >> 17) & 0x400u); // encoded[1] bit 27 -> bw bit 10
+            bz |= ((encoded[1] >> 27) & 0x2u); // encoded[1] bit 28 -> bz bit 1
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x1fu); // encoded[2] bits 1-5 -> ry bits 0-4
+            bz |= ((encoded[2] >> 4) & 0x4u); // encoded[2] bit 6 -> bz bit 2
+            rz |= ((encoded[2] >> 7) & 0x1fu); // encoded[2] bits 7-11 -> rz bits 0-4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode3(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-3 bit layout (the inverse of WriteMode3's packing). Bits 0-4 of encoded[0], where WriteMode3 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // encoded[0] bits 5-14 -> rw bits 0-9
+            gw |= ((encoded[0] >> 15) & 0x3ffu); // encoded[0] bits 15-24 -> gw bits 0-9
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x380u); // encoded[1] bits 0-2 -> bw bits 7-9
+            rx |= ((encoded[1] >> 3) & 0xfu); // encoded[1] bits 3-6 -> rx bits 0-3
+            rw |= ((encoded[1] << 3) & 0x400u); // encoded[1] bit 7 -> rw bit 10
+            gz |= ((encoded[1] >> 4) & 0x10u); // encoded[1] bit 8 -> gz bit 4
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x1fu); // encoded[1] bits 13-17 -> gx bits 0-4
+            gw |= ((encoded[1] >> 8) & 0x400u); // encoded[1] bit 18 -> gw bit 10
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0xfu); // encoded[1] bits 23-26 -> bx bits 0-3
+            bw |= ((encoded[1] >> 17) & 0x400u); // encoded[1] bit 27 -> bw bit 10
+            bz |= ((encoded[1] >> 27) & 0x2u); // encoded[1] bit 28 -> bz bit 1
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0xfu); // encoded[2] bits 1-4 -> ry bits 0-3
+            bz |= ((encoded[2] >> 5) & 0x1u); // encoded[2] bit 5 -> bz bit 0
+            bz |= ((encoded[2] >> 4) & 0x4u); // encoded[2] bit 6 -> bz bit 2
+            rz |= ((encoded[2] >> 7) & 0xfu); // encoded[2] bits 7-10 -> rz bits 0-3
+            gy |= ((encoded[2] >> 7) & 0x10u); // encoded[2] bit 11 -> gy bit 4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode4(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-4 bit layout (the inverse of WriteMode4's packing). Bits 0-4 of encoded[0], where WriteMode4 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // encoded[0] bits 5-14 -> rw bits 0-9
+            gw |= ((encoded[0] >> 15) & 0x3ffu); // encoded[0] bits 15-24 -> gw bits 0-9
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x380u); // encoded[1] bits 0-2 -> bw bits 7-9
+            rx |= ((encoded[1] >> 3) & 0xfu); // encoded[1] bits 3-6 -> rx bits 0-3
+            rw |= ((encoded[1] << 3) & 0x400u); // encoded[1] bit 7 -> rw bit 10
+            by |= ((encoded[1] >> 4) & 0x10u); // encoded[1] bit 8 -> by bit 4
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0xfu); // encoded[1] bits 13-16 -> gx bits 0-3
+            gw |= ((encoded[1] >> 7) & 0x400u); // encoded[1] bit 17 -> gw bit 10
+            bz |= ((encoded[1] >> 18) & 0x1u); // encoded[1] bit 18 -> bz bit 0
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x1fu); // encoded[1] bits 23-27 -> bx bits 0-4
+            bw |= ((encoded[1] >> 18) & 0x400u); // encoded[1] bit 28 -> bw bit 10
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0xfu); // encoded[2] bits 1-4 -> ry bits 0-3
+            bz |= ((encoded[2] >> 4) & 0x6u); // encoded[2] bits 5-6 -> bz bits 1-2
+            rz |= ((encoded[2] >> 7) & 0xfu); // encoded[2] bits 7-10 -> rz bits 0-3
+            bz |= ((encoded[2] >> 7) & 0x10u); // encoded[2] bit 11 -> bz bit 4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode5(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-5 bit layout (the inverse of WriteMode5's packing). Bits 0-4 of encoded[0], where WriteMode5 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x1ffu); // encoded[0] bits 5-13 -> rw bits 0-8
+            by |= ((encoded[0] >> 10) & 0x10u); // encoded[0] bit 14 -> by bit 4
+            gw |= ((encoded[0] >> 15) & 0x1ffu); // encoded[0] bits 15-23 -> gw bits 0-8
+            gy |= ((encoded[0] >> 20) & 0x10u); // encoded[0] bit 24 -> gy bit 4
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x180u); // encoded[1] bits 0-1 -> bw bits 7-8
+            bz |= ((encoded[1] << 2) & 0x10u); // encoded[1] bit 2 -> bz bit 4
+            rx |= ((encoded[1] >> 3) & 0x1fu); // encoded[1] bits 3-7 -> rx bits 0-4
+            gz |= ((encoded[1] >> 4) & 0x10u); // encoded[1] bit 8 -> gz bit 4
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x1fu); // encoded[1] bits 13-17 -> gx bits 0-4
+            bz |= ((encoded[1] >> 18) & 0x1u); // encoded[1] bit 18 -> bz bit 0
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x1fu); // encoded[1] bits 23-27 -> bx bits 0-4
+            bz |= ((encoded[1] >> 27) & 0x2u); // encoded[1] bit 28 -> bz bit 1
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x1fu); // encoded[2] bits 1-5 -> ry bits 0-4
+            bz |= ((encoded[2] >> 4) & 0x4u); // encoded[2] bit 6 -> bz bit 2
+            rz |= ((encoded[2] >> 7) & 0x1fu); // encoded[2] bits 7-11 -> rz bits 0-4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode6(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-6 bit layout (the inverse of WriteMode6's packing). Bits 0-4 of encoded[0], where WriteMode6 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0xffu); // encoded[0] bits 5-12 -> rw bits 0-7
+            gz |= ((encoded[0] >> 9) & 0x10u); // encoded[0] bit 13 -> gz bit 4
+            by |= ((encoded[0] >> 10) & 0x10u); // encoded[0] bit 14 -> by bit 4
+            gw |= ((encoded[0] >> 15) & 0xffu); // encoded[0] bits 15-22 -> gw bits 0-7
+            bz |= ((encoded[0] >> 21) & 0x4u); // encoded[0] bit 23 -> bz bit 2
+            gy |= ((encoded[0] >> 20) & 0x10u); // encoded[0] bit 24 -> gy bit 4
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x80u); // encoded[1] bit 0 -> bw bit 7
+            bz |= ((encoded[1] << 2) & 0x18u); // encoded[1] bits 1-2 -> bz bits 3-4
+            rx |= ((encoded[1] >> 3) & 0x3fu); // encoded[1] bits 3-8 -> rx bits 0-5
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x1fu); // encoded[1] bits 13-17 -> gx bits 0-4
+            bz |= ((encoded[1] >> 18) & 0x1u); // encoded[1] bit 18 -> bz bit 0
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x1fu); // encoded[1] bits 23-27 -> bx bits 0-4
+            bz |= ((encoded[1] >> 27) & 0x2u); // encoded[1] bit 28 -> bz bit 1
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x3fu); // encoded[2] bits 1-6 -> ry bits 0-5
+            rz |= ((encoded[2] >> 7) & 0x3fu); // encoded[2] bits 7-12 -> rz bits 0-5
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode7(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-7 bit layout (the inverse of WriteMode7's packing). Bits 0-4 of encoded[0], where WriteMode7 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0xffu); // encoded[0] bits 5-12 -> rw bits 0-7
+            bz |= ((encoded[0] >> 13) & 0x1u); // encoded[0] bit 13 -> bz bit 0
+            by |= ((encoded[0] >> 10) & 0x10u); // encoded[0] bit 14 -> by bit 4
+            gw |= ((encoded[0] >> 15) & 0xffu); // encoded[0] bits 15-22 -> gw bits 0-7
+            gy |= ((encoded[0] >> 18) & 0x20u); // encoded[0] bit 23 -> gy bit 5
+            gy |= ((encoded[0] >> 20) & 0x10u); // encoded[0] bit 24 -> gy bit 4
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x80u); // encoded[1] bit 0 -> bw bit 7
+            gz |= ((encoded[1] << 4) & 0x20u); // encoded[1] bit 1 -> gz bit 5
+            bz |= ((encoded[1] << 2) & 0x10u); // encoded[1] bit 2 -> bz bit 4
+            rx |= ((encoded[1] >> 3) & 0x1fu); // encoded[1] bits 3-7 -> rx bits 0-4
+            gz |= ((encoded[1] >> 4) & 0x10u); // encoded[1] bit 8 -> gz bit 4
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x3fu); // encoded[1] bits 13-18 -> gx bits 0-5
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x1fu); // encoded[1] bits 23-27 -> bx bits 0-4
+            bz |= ((encoded[1] >> 27) & 0x2u); // encoded[1] bit 28 -> bz bit 1
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x1fu); // encoded[2] bits 1-5 -> ry bits 0-4
+            bz |= ((encoded[2] >> 4) & 0x4u); // encoded[2] bit 6 -> bz bit 2
+            rz |= ((encoded[2] >> 7) & 0x1fu); // encoded[2] bits 7-11 -> rz bits 0-4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode8(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Gathers d and the r/g/b w/x/y/z fields from encoded[0..2] using the mode-8 bit layout (the inverse of WriteMode8's packing). Bits 0-4 of encoded[0], where WriteMode8 stores m, are not read; neither is encoded[3].
+            uint16_t d = 0; // every field accumulator starts at zero because the slices below are OR-ed in piece by piece
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0xffu); // encoded[0] bits 5-12 -> rw bits 0-7
+            bz |= ((encoded[0] >> 12) & 0x2u); // encoded[0] bit 13 -> bz bit 1
+            by |= ((encoded[0] >> 10) & 0x10u); // encoded[0] bit 14 -> by bit 4
+            gw |= ((encoded[0] >> 15) & 0xffu); // encoded[0] bits 15-22 -> gw bits 0-7
+            by |= ((encoded[0] >> 18) & 0x20u); // encoded[0] bit 23 -> by bit 5
+            gy |= ((encoded[0] >> 20) & 0x10u); // encoded[0] bit 24 -> gy bit 4
+            bw |= ((encoded[0] >> 25) & 0x7fu); // encoded[0] bits 25-31 -> bw bits 0-6
+            bw |= ((encoded[1] << 7) & 0x80u); // encoded[1] bit 0 -> bw bit 7
+            bz |= ((encoded[1] << 4) & 0x20u); // encoded[1] bit 1 -> bz bit 5
+            bz |= ((encoded[1] << 2) & 0x10u); // encoded[1] bit 2 -> bz bit 4
+            rx |= ((encoded[1] >> 3) & 0x1fu); // encoded[1] bits 3-7 -> rx bits 0-4
+            gz |= ((encoded[1] >> 4) & 0x10u); // encoded[1] bit 8 -> gz bit 4
+            gy |= ((encoded[1] >> 9) & 0xfu); // encoded[1] bits 9-12 -> gy bits 0-3
+            gx |= ((encoded[1] >> 13) & 0x1fu); // encoded[1] bits 13-17 -> gx bits 0-4
+            bz |= ((encoded[1] >> 18) & 0x1u); // encoded[1] bit 18 -> bz bit 0
+            gz |= ((encoded[1] >> 19) & 0xfu); // encoded[1] bits 19-22 -> gz bits 0-3
+            bx |= ((encoded[1] >> 23) & 0x3fu); // encoded[1] bits 23-28 -> bx bits 0-5
+            by |= ((encoded[1] >> 29) & 0x7u); // encoded[1] bits 29-31 -> by bits 0-2
+            by |= ((encoded[2] << 3) & 0x8u); // encoded[2] bit 0 -> by bit 3
+            ry |= ((encoded[2] >> 1) & 0x1fu); // encoded[2] bits 1-5 -> ry bits 0-4
+            bz |= ((encoded[2] >> 4) & 0x4u); // encoded[2] bit 6 -> bz bit 2
+            rz |= ((encoded[2] >> 7) & 0x1fu); // encoded[2] bits 7-11 -> rz bits 0-4
+            bz |= ((encoded[2] >> 9) & 0x8u); // encoded[2] bit 12 -> bz bit 3
+            d |= ((encoded[2] >> 13) & 0x1fu); // encoded[2] bits 13-17 -> d bits 0-4
+            outD = d; // outputs are assigned only after every field has been fully assembled
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode9(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Unpacks d and the r/g/b w,x,y,z fields from encoded[0..2]; in this layout every colour field is 6 bits wide.
+            uint16_t d = 0; // every field starts at zero; bit groups are OR-ed in below
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3fu);
+            gz |= ((encoded[0] >> 7) & 0x10u); // gy, gz, by and bz are assembled from bit groups scattered across the words
+            bz |= ((encoded[0] >> 12) & 0x3u);
+            by |= ((encoded[0] >> 10) & 0x10u);
+            gw |= ((encoded[0] >> 15) & 0x3fu);
+            gy |= ((encoded[0] >> 16) & 0x20u);
+            by |= ((encoded[0] >> 17) & 0x20u);
+            bz |= ((encoded[0] >> 21) & 0x4u);
+            gy |= ((encoded[0] >> 20) & 0x10u);
+            bw |= ((encoded[0] >> 25) & 0x3fu);
+            gz |= ((encoded[0] >> 26) & 0x20u); // top bit of encoded[0] becomes bit 5 of gz
+            bz |= ((encoded[1] << 3) & 0x8u); // the three lowest bits of encoded[1] supply bits 3, 5 and 4 of bz, in that order
+            bz |= ((encoded[1] << 4) & 0x20u);
+            bz |= ((encoded[1] << 2) & 0x10u);
+            rx |= ((encoded[1] >> 3) & 0x3fu);
+            gy |= ((encoded[1] >> 9) & 0xfu);
+            gx |= ((encoded[1] >> 13) & 0x3fu);
+            gz |= ((encoded[1] >> 19) & 0xfu);
+            bx |= ((encoded[1] >> 23) & 0x3fu);
+            by |= ((encoded[1] >> 29) & 0x7u);
+            by |= ((encoded[2] << 3) & 0x8u); // bit 3 of by is the lowest bit of encoded[2]
+            ry |= ((encoded[2] >> 1) & 0x3fu);
+            rz |= ((encoded[2] >> 7) & 0x3fu);
+            d |= ((encoded[2] >> 13) & 0x1fu); // 5-bit d field
+            outD = d; // hand all thirteen fields back through the reference parameters
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode10(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Unpacks the 10-bit w and x fields of each channel from encoded[0..2]; d and the y/z fields are not present in this layout.
+            uint16_t d = 0; // never modified below, returned as 0
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0; // ry/rz, gy/gz and by/bz are likewise left at 0
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu);
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu); // low 7 bits of bw are the top bits of encoded[0]
+            bw |= ((encoded[1] << 7) & 0x380u); // bits 7..9 of bw are the three lowest bits of encoded[1]
+            rx |= ((encoded[1] >> 3) & 0x3ffu);
+            gx |= ((encoded[1] >> 13) & 0x3ffu);
+            bx |= ((encoded[1] >> 23) & 0x1ffu); // low 9 bits of bx are the top bits of encoded[1]
+            bx |= ((encoded[2] << 9) & 0x200u); // bit 9 of bx is the lowest bit of encoded[2]
+            outD = d; // hand all thirteen fields back through the reference parameters
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode11(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Unpacks 11-bit w fields and 9-bit x fields for each channel from encoded[0..2]; d and the y/z fields are not present in this layout.
+            uint16_t d = 0; // never modified below, returned as 0
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0; // ry/rz, gy/gz and by/bz are likewise left at 0
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // low 10 bits of each w field are laid out as in the 10-bit reader
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu);
+            bw |= ((encoded[1] << 7) & 0x380u);
+            rx |= ((encoded[1] >> 3) & 0x1ffu);
+            rw |= ((encoded[1] >> 2) & 0x400u); // bit 10 of rw sits directly above the 9-bit rx in encoded[1]
+            gx |= ((encoded[1] >> 13) & 0x1ffu);
+            gw |= ((encoded[1] >> 12) & 0x400u); // same arrangement for green
+            bx |= ((encoded[1] >> 23) & 0x1ffu);
+            bw |= ((encoded[2] << 10) & 0x400u); // bit 10 of bw is the lowest bit of encoded[2]
+            outD = d; // hand all thirteen fields back through the reference parameters
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode12(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Unpacks 12-bit w fields and 8-bit x fields for each channel from encoded[0..2]; d and the y/z fields are not present in this layout.
+            uint16_t d = 0; // never modified below, returned as 0
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0; // ry/rz, gy/gz and by/bz are likewise left at 0
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // low 10 bits of each w field come first
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu);
+            bw |= ((encoded[1] << 7) & 0x380u);
+            rx |= ((encoded[1] >> 3) & 0xffu);
+            rw |= (encoded[1] & 0x800u); // bits 11 and 10 of rw follow rx in the stream, higher bit first
+            rw |= ((encoded[1] >> 2) & 0x400u);
+            gx |= ((encoded[1] >> 13) & 0xffu);
+            gw |= ((encoded[1] >> 10) & 0x800u); // same arrangement for green
+            gw |= ((encoded[1] >> 12) & 0x400u);
+            bx |= ((encoded[1] >> 23) & 0xffu);
+            bw |= ((encoded[1] >> 20) & 0x800u); // bit 11 of bw is the top bit of encoded[1]
+            bw |= ((encoded[2] << 10) & 0x400u); // bit 10 of bw is the lowest bit of encoded[2]
+            outD = d; // hand all thirteen fields back through the reference parameters
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode13(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        { // Unpacks 16-bit w fields and 4-bit x fields for each channel from encoded[0..2]; d and the y/z fields are not present in this layout.
+            uint16_t d = 0; // never modified below, returned as 0
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0; // ry/rz, gy/gz and by/bz are likewise left at 0
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // low 10 bits of each w field come first
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu);
+            bw |= ((encoded[1] << 7) & 0x380u);
+            rx |= ((encoded[1] >> 3) & 0xfu);
+            rw |= ((encoded[1] << 8) & 0x8000u); // bits 15..10 of rw follow rx one bit at a time, highest bit first
+            rw |= ((encoded[1] << 6) & 0x4000u);
+            rw |= ((encoded[1] << 4) & 0x2000u);
+            rw |= ((encoded[1] << 2) & 0x1000u);
+            rw |= (encoded[1] & 0x800u);
+            rw |= ((encoded[1] >> 2) & 0x400u);
+            gx |= ((encoded[1] >> 13) & 0xfu);
+            gw |= ((encoded[1] >> 2) & 0x8000u); // same highest-bit-first arrangement for green
+            gw |= ((encoded[1] >> 4) & 0x4000u);
+            gw |= ((encoded[1] >> 6) & 0x2000u);
+            gw |= ((encoded[1] >> 8) & 0x1000u);
+            gw |= ((encoded[1] >> 10) & 0x800u);
+            gw |= ((encoded[1] >> 12) & 0x400u);
+            bx |= ((encoded[1] >> 23) & 0xfu);
+            bw |= ((encoded[1] >> 12) & 0x8000u); // and for blue, whose last bit spills into encoded[2]
+            bw |= ((encoded[1] >> 14) & 0x4000u);
+            bw |= ((encoded[1] >> 16) & 0x2000u);
+            bw |= ((encoded[1] >> 18) & 0x1000u);
+            bw |= ((encoded[1] >> 20) & 0x800u);
+            bw |= ((encoded[2] << 10) & 0x400u); // bit 10 of bw is the lowest bit of encoded[2]
+            outD = d; // hand all thirteen fields back through the reference parameters
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        const ReadFunc_t g_readFuncs[14] = // dispatch table of the 14 readers; entry i is ReadMode<i>
+        {
+            ReadMode0,
+            ReadMode1,
+            ReadMode2,
+            ReadMode3,
+            ReadMode4,
+            ReadMode5,
+            ReadMode6,
+            ReadMode7,
+            ReadMode8,
+            ReadMode9,
+            ReadMode10,
+            ReadMode11,
+            ReadMode12,
+            ReadMode13
+        };
+
+        const WriteFunc_t g_writeFuncs[14] = // dispatch table of the 14 writers; entry i is WriteMode<i>, in the same order as g_readFuncs
+        {
+            WriteMode0,
+            WriteMode1,
+            WriteMode2,
+            WriteMode3,
+            WriteMode4,
+            WriteMode5,
+            WriteMode6,
+            WriteMode7,
+            WriteMode8,
+            WriteMode9,
+            WriteMode10,
+            WriteMode11,
+            WriteMode12,
+            WriteMode13
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h
new file mode 100644
index 0000000000..a7bb517b54
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdint.h>
+#include "ConvectionKernels_BC6H_IO.h" // NOTE(review): this header includes itself; a no-op under #pragma once — confirm whether it is intentional
+
+namespace cvtt
+{
+    namespace BC6H_IO
+    {
+        typedef void (*ReadFunc_t)(const uint32_t *encoded, uint16_t &d, uint16_t &rw, uint16_t &rx, uint16_t &ry, uint16_t &rz, uint16_t &gw, uint16_t &gx, uint16_t &gy, uint16_t &gz, uint16_t &bw, uint16_t &bx, uint16_t &by, uint16_t &bz); // reader: takes the encoded words read-only and returns thirteen fields by reference
+        typedef void (*WriteFunc_t)(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz); // writer: takes the same fields by value plus m (presumably the mode bits — confirm) and a writable encoded buffer
+
+        extern const ReadFunc_t g_readFuncs[14]; // 14-entry reader table, defined outside this header
+        extern const WriteFunc_t g_writeFuncs[14]; // 14-entry writer table, defined outside this header
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h b/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h
new file mode 100644
index 0000000000..1880e22d0f
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <stdint.h>
+
+namespace cvtt { namespace Tables { namespace BC7Prio {
+    extern const uint16_t *g_bc7PrioCodesRGB; // pointer to 16-bit codes for RGB; presumably g_bc7NumPrioCodesRGB entries long — confirm at the definition
+    extern const int g_bc7NumPrioCodesRGB;
+
+    extern const uint16_t *g_bc7PrioCodesRGBA; // RGBA counterpart; presumably g_bc7NumPrioCodesRGBA entries long — confirm at the definition
+    extern const int g_bc7NumPrioCodesRGBA;
+
+    int UnpackMode(uint16_t packed); // the Unpack* helpers each take one packed 16-bit code; definitions are not in this header
+    int UnpackSeedPointCount(uint16_t packed); // NOTE(review): presumably each returns the field named in its identifier — verify against the definitions
+    int UnpackPartition(uint16_t packed);
+    int UnpackRotation(uint16_t packed);
+    int UnpackIndexSelector(uint16_t packed);
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp b/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp
new file mode 100644
index 0000000000..5b3134f860
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp
@@ -0,0 +1,1301 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC7_Prio.h"
+
+#define BC7_PARTITION_BITS  6 // field widths and bit offsets used to pack one code; partition occupies 6 bits at offset 0
+#define BC7_PARTITION_OFFSET_BITS  0
+
+#define BC7_ROTATION_BITS   2 // rotation also starts at offset 0, i.e. it overlaps the partition field
+#define BC7_ROTATION_OFFSET_BITS    0
+
+#define BC7_INDEX_MODE_BITS 1 // index mode is 1 bit placed directly above rotation
+#define BC7_INDEX_MODE_OFFSET_BITS (BC7_ROTATION_OFFSET_BITS + BC7_ROTATION_BITS)
+
+#define BC7_MODE_BITS 3 // mode is 3 bits placed directly above the partition field
+#define BC7_MODE_OFFSET_BITS (BC7_PARTITION_OFFSET_BITS + BC7_PARTITION_BITS)
+#define BC7_SEED_POINT_COUNT_BITS  2 // seed point count is 2 bits placed directly above mode
+#define BC7_SEED_POINT_COUNT_OFFSET_BITS  (BC7_MODE_BITS + BC7_MODE_OFFSET_BITS)
+
+// BC7_MODE_PRIO_DUAL_PLANE(subData): subData is read as a two-digit decimal number; the tens digit is shifted to the
+// rotation offset and the ones digit to the index-mode offset. Arguments are not parenthesized, so pass plain literals.
+#define BC7_MODE_PRIO_DUAL_PLANE(subData)   \
+    ( \
+        ((subData / 10) << BC7_ROTATION_OFFSET_BITS) | \
+        ((subData % 10) << BC7_INDEX_MODE_OFFSET_BITS) \
+    )
+// BC7_MODE_PRIO_CODE: ORs together (seedPointCount - 1), mode, and subData (dual-plane packing for modes 4 and 5, otherwise shifted to the partition offset).
+#define BC7_MODE_PRIO_CODE(seedPointCount, mode, subData)   \
+    (\
+        ((seedPointCount - 1) << BC7_SEED_POINT_COUNT_OFFSET_BITS) |  \
+        (mode << BC7_MODE_OFFSET_BITS) |   \
+        ((mode == 4 || mode == 5) ? BC7_MODE_PRIO_DUAL_PLANE(subData) : (subData << BC7_PARTITION_OFFSET_BITS)) \
+    )
+
+namespace cvtt { namespace Tables { namespace BC7Prio {
+    const uint16_t g_bc7PrioCodesRGBData[] =
+    {
+        BC7_MODE_PRIO_CODE(1, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 0, 1),
+        BC7_MODE_PRIO_CODE(1, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 9),
+        BC7_MODE_PRIO_CODE(1, 1, 6),
+        BC7_MODE_PRIO_CODE(1, 1, 1),
+        BC7_MODE_PRIO_CODE(1, 1, 2),
+        BC7_MODE_PRIO_CODE(1, 0, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 7),
+        BC7_MODE_PRIO_CODE(1, 1, 16),
+        BC7_MODE_PRIO_CODE(1, 1, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 14),
+        BC7_MODE_PRIO_CODE(1, 0, 13),
+        BC7_MODE_PRIO_CODE(1, 0, 14),
+        BC7_MODE_PRIO_CODE(1, 0, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 22),
+        BC7_MODE_PRIO_CODE(1, 0, 8),
+        BC7_MODE_PRIO_CODE(1, 0, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 8),
+        BC7_MODE_PRIO_CODE(1, 3, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 19),
+        BC7_MODE_PRIO_CODE(1, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 23),
+        BC7_MODE_PRIO_CODE(1, 1, 3),
+        BC7_MODE_PRIO_CODE(2, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 1, 21),
+        BC7_MODE_PRIO_CODE(1, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 29),
+        BC7_MODE_PRIO_CODE(1, 1, 26),
+        BC7_MODE_PRIO_CODE(1, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 0, 4),
+        BC7_MODE_PRIO_CODE(2, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 0),
+        BC7_MODE_PRIO_CODE(2, 0, 10),
+        BC7_MODE_PRIO_CODE(3, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 11),
+        BC7_MODE_PRIO_CODE(1, 4, 10),
+        BC7_MODE_PRIO_CODE(2, 0, 8),
+        BC7_MODE_PRIO_CODE(2, 0, 11),
+        BC7_MODE_PRIO_CODE(2, 0, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 4),
+        BC7_MODE_PRIO_CODE(3, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 12),
+        BC7_MODE_PRIO_CODE(1, 1, 18),
+        BC7_MODE_PRIO_CODE(1, 3, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 5),
+        BC7_MODE_PRIO_CODE(1, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 1, 25),
+        BC7_MODE_PRIO_CODE(1, 0, 7),
+        BC7_MODE_PRIO_CODE(3, 0, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 24),
+        BC7_MODE_PRIO_CODE(3, 0, 8),
+        BC7_MODE_PRIO_CODE(3, 1, 0),
+        BC7_MODE_PRIO_CODE(2, 1, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 14),
+        BC7_MODE_PRIO_CODE(3, 0, 13),
+        BC7_MODE_PRIO_CODE(3, 0, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 16),
+        BC7_MODE_PRIO_CODE(2, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 3),
+        BC7_MODE_PRIO_CODE(4, 0, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 1),
+        BC7_MODE_PRIO_CODE(1, 0, 2),
+        BC7_MODE_PRIO_CODE(2, 1, 2),
+        BC7_MODE_PRIO_CODE(4, 0, 8),
+        BC7_MODE_PRIO_CODE(1, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 0, 15),
+        BC7_MODE_PRIO_CODE(1, 0, 6),
+        BC7_MODE_PRIO_CODE(1, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 1, 23),
+        BC7_MODE_PRIO_CODE(4, 0, 13),
+        BC7_MODE_PRIO_CODE(4, 0, 11),
+        BC7_MODE_PRIO_CODE(1, 2, 17),
+        BC7_MODE_PRIO_CODE(2, 1, 6),
+        BC7_MODE_PRIO_CODE(2, 1, 7),
+        BC7_MODE_PRIO_CODE(4, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 1, 19),
+        BC7_MODE_PRIO_CODE(1, 1, 30),
+        BC7_MODE_PRIO_CODE(2, 3, 13),
+        BC7_MODE_PRIO_CODE(3, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 21),
+        BC7_MODE_PRIO_CODE(4, 1, 0),
+        BC7_MODE_PRIO_CODE(3, 0, 15),
+        BC7_MODE_PRIO_CODE(2, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 1, 28),
+        BC7_MODE_PRIO_CODE(1, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 0, 4),
+        BC7_MODE_PRIO_CODE(1, 2, 63),
+        BC7_MODE_PRIO_CODE(4, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 26),
+        BC7_MODE_PRIO_CODE(2, 0, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 1, 61),
+        BC7_MODE_PRIO_CODE(2, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 0, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 10),
+        BC7_MODE_PRIO_CODE(2, 4, 31),
+        BC7_MODE_PRIO_CODE(2, 0, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 11),
+        BC7_MODE_PRIO_CODE(4, 0, 15),
+        BC7_MODE_PRIO_CODE(3, 1, 14),
+        BC7_MODE_PRIO_CODE(2, 0, 0),
+        BC7_MODE_PRIO_CODE(3, 1, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 0),
+        BC7_MODE_PRIO_CODE(3, 0, 1),
+        BC7_MODE_PRIO_CODE(1, 1, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 5),
+        BC7_MODE_PRIO_CODE(1, 1, 27),
+        BC7_MODE_PRIO_CODE(2, 1, 18),
+        BC7_MODE_PRIO_CODE(3, 0, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 3),
+        BC7_MODE_PRIO_CODE(2, 0, 2),
+        BC7_MODE_PRIO_CODE(3, 1, 16),
+        BC7_MODE_PRIO_CODE(3, 1, 2),
+        BC7_MODE_PRIO_CODE(1, 1, 31),
+        BC7_MODE_PRIO_CODE(3, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 1, 4),
+        BC7_MODE_PRIO_CODE(1, 1, 62),
+        BC7_MODE_PRIO_CODE(2, 0, 12),
+        BC7_MODE_PRIO_CODE(3, 0, 4),
+        BC7_MODE_PRIO_CODE(4, 0, 4),
+        BC7_MODE_PRIO_CODE(1, 1, 33),
+        BC7_MODE_PRIO_CODE(3, 1, 23),
+        BC7_MODE_PRIO_CODE(2, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 0, 6),
+        BC7_MODE_PRIO_CODE(2, 1, 24),
+        BC7_MODE_PRIO_CODE(1, 1, 59),
+        BC7_MODE_PRIO_CODE(1, 1, 63),
+        BC7_MODE_PRIO_CODE(3, 0, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 52),
+        BC7_MODE_PRIO_CODE(4, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 1, 22),
+        BC7_MODE_PRIO_CODE(4, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 2, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 7),
+        BC7_MODE_PRIO_CODE(4, 0, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 8),
+        BC7_MODE_PRIO_CODE(4, 0, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 0, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 6),
+        BC7_MODE_PRIO_CODE(4, 1, 14),
+        BC7_MODE_PRIO_CODE(1, 3, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 0, 6),
+        BC7_MODE_PRIO_CODE(3, 0, 2),
+        BC7_MODE_PRIO_CODE(1, 1, 32),
+        BC7_MODE_PRIO_CODE(4, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 8),
+        BC7_MODE_PRIO_CODE(2, 1, 9),
+        BC7_MODE_PRIO_CODE(1, 2, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 15),
+        BC7_MODE_PRIO_CODE(4, 0, 6),
+        BC7_MODE_PRIO_CODE(3, 1, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 31),
+        BC7_MODE_PRIO_CODE(3, 3, 13),
+        BC7_MODE_PRIO_CODE(4, 0, 0),
+        BC7_MODE_PRIO_CODE(3, 1, 19),
+        BC7_MODE_PRIO_CODE(4, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 1),
+        BC7_MODE_PRIO_CODE(4, 0, 2),
+        BC7_MODE_PRIO_CODE(1, 3, 2),
+        BC7_MODE_PRIO_CODE(1, 2, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 58),
+        BC7_MODE_PRIO_CODE(1, 3, 14),
+        BC7_MODE_PRIO_CODE(4, 1, 3),
+        BC7_MODE_PRIO_CODE(3, 1, 21),
+        BC7_MODE_PRIO_CODE(2, 2, 8),
+        BC7_MODE_PRIO_CODE(1, 2, 19),
+        BC7_MODE_PRIO_CODE(4, 1, 16),
+        BC7_MODE_PRIO_CODE(4, 1, 2),
+        BC7_MODE_PRIO_CODE(2, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 2, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 1, 47),
+        BC7_MODE_PRIO_CODE(1, 3, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 21),
+        BC7_MODE_PRIO_CODE(1, 2, 62),
+        BC7_MODE_PRIO_CODE(2, 2, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 26),
+        BC7_MODE_PRIO_CODE(1, 1, 53),
+        BC7_MODE_PRIO_CODE(2, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 2, 13),
+        BC7_MODE_PRIO_CODE(4, 1, 23),
+        BC7_MODE_PRIO_CODE(4, 1, 6),
+        BC7_MODE_PRIO_CODE(4, 1, 7),
+        BC7_MODE_PRIO_CODE(1, 2, 25),
+        BC7_MODE_PRIO_CODE(1, 1, 57),
+        BC7_MODE_PRIO_CODE(2, 1, 60),
+        BC7_MODE_PRIO_CODE(1, 2, 20),
+        BC7_MODE_PRIO_CODE(3, 1, 8),
+        BC7_MODE_PRIO_CODE(4, 1, 29),
+        BC7_MODE_PRIO_CODE(4, 1, 19),
+        BC7_MODE_PRIO_CODE(3, 2, 8),
+        BC7_MODE_PRIO_CODE(2, 4, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 61),
+        BC7_MODE_PRIO_CODE(2, 1, 30),
+        BC7_MODE_PRIO_CODE(3, 1, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 63),
+        BC7_MODE_PRIO_CODE(2, 3, 1),
+        BC7_MODE_PRIO_CODE(2, 1, 28),
+        BC7_MODE_PRIO_CODE(2, 1, 62),
+        BC7_MODE_PRIO_CODE(3, 2, 13),
+        BC7_MODE_PRIO_CODE(2, 2, 63),
+        BC7_MODE_PRIO_CODE(2, 1, 33),
+        BC7_MODE_PRIO_CODE(2, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 18),
+        BC7_MODE_PRIO_CODE(2, 5, 30),
+        BC7_MODE_PRIO_CODE(3, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 1, 55),
+        BC7_MODE_PRIO_CODE(3, 1, 17),
+        BC7_MODE_PRIO_CODE(2, 3, 2),
+        BC7_MODE_PRIO_CODE(1, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 27),
+        BC7_MODE_PRIO_CODE(1, 2, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 26),
+        BC7_MODE_PRIO_CODE(3, 1, 9),
+        BC7_MODE_PRIO_CODE(2, 3, 14),
+        BC7_MODE_PRIO_CODE(3, 1, 4),
+        BC7_MODE_PRIO_CODE(3, 1, 24),
+        BC7_MODE_PRIO_CODE(3, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 3, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 12),
+        BC7_MODE_PRIO_CODE(2, 1, 32),
+        BC7_MODE_PRIO_CODE(2, 3, 15),
+        BC7_MODE_PRIO_CODE(4, 2, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 60),
+        BC7_MODE_PRIO_CODE(1, 2, 32),
+        BC7_MODE_PRIO_CODE(1, 1, 40),
+        BC7_MODE_PRIO_CODE(4, 1, 18),
+        BC7_MODE_PRIO_CODE(2, 1, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 22),
+        BC7_MODE_PRIO_CODE(3, 2, 16),
+        BC7_MODE_PRIO_CODE(3, 1, 20),
+        BC7_MODE_PRIO_CODE(4, 1, 4),
+        BC7_MODE_PRIO_CODE(2, 1, 31),
+        BC7_MODE_PRIO_CODE(4, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 2, 24),
+        BC7_MODE_PRIO_CODE(4, 1, 24),
+        BC7_MODE_PRIO_CODE(2, 1, 58),
+        BC7_MODE_PRIO_CODE(4, 2, 8),
+        BC7_MODE_PRIO_CODE(1, 2, 22),
+        BC7_MODE_PRIO_CODE(1, 2, 23),
+        BC7_MODE_PRIO_CODE(1, 3, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 1, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 29),
+        BC7_MODE_PRIO_CODE(1, 2, 57),
+        BC7_MODE_PRIO_CODE(2, 2, 19),
+        BC7_MODE_PRIO_CODE(1, 2, 53),
+        BC7_MODE_PRIO_CODE(1, 2, 55),
+        BC7_MODE_PRIO_CODE(3, 2, 63),
+        BC7_MODE_PRIO_CODE(3, 1, 60),
+        BC7_MODE_PRIO_CODE(4, 1, 8),
+        BC7_MODE_PRIO_CODE(2, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 1, 35),
+        BC7_MODE_PRIO_CODE(4, 4, 31),
+        BC7_MODE_PRIO_CODE(4, 1, 9),
+        BC7_MODE_PRIO_CODE(1, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 58),
+        BC7_MODE_PRIO_CODE(2, 3, 29),
+        BC7_MODE_PRIO_CODE(1, 1, 45),
+        BC7_MODE_PRIO_CODE(4, 2, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 42),
+        BC7_MODE_PRIO_CODE(1, 3, 3),
+        BC7_MODE_PRIO_CODE(4, 2, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 63),
+        BC7_MODE_PRIO_CODE(3, 1, 30),
+        BC7_MODE_PRIO_CODE(1, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 1, 62),
+        BC7_MODE_PRIO_CODE(1, 1, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 2, 14),
+        BC7_MODE_PRIO_CODE(1, 1, 48),
+        BC7_MODE_PRIO_CODE(2, 1, 57),
+        BC7_MODE_PRIO_CODE(2, 1, 52),
+        BC7_MODE_PRIO_CODE(1, 2, 61),
+        BC7_MODE_PRIO_CODE(3, 1, 33),
+        BC7_MODE_PRIO_CODE(1, 1, 51),
+        BC7_MODE_PRIO_CODE(4, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 1, 22),
+        BC7_MODE_PRIO_CODE(1, 3, 19),
+        BC7_MODE_PRIO_CODE(1, 2, 36),
+        BC7_MODE_PRIO_CODE(2, 5, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 28),
+        BC7_MODE_PRIO_CODE(2, 2, 14),
+        BC7_MODE_PRIO_CODE(1, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 26),
+        BC7_MODE_PRIO_CODE(2, 1, 53),
+        BC7_MODE_PRIO_CODE(4, 3, 13),
+        BC7_MODE_PRIO_CODE(2, 2, 21),
+        BC7_MODE_PRIO_CODE(3, 4, 10),
+        BC7_MODE_PRIO_CODE(4, 1, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 2, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 47),
+        BC7_MODE_PRIO_CODE(1, 2, 52),
+        BC7_MODE_PRIO_CODE(3, 1, 32),
+        BC7_MODE_PRIO_CODE(1, 2, 40),
+        BC7_MODE_PRIO_CODE(1, 2, 31),
+        BC7_MODE_PRIO_CODE(3, 1, 27),
+        BC7_MODE_PRIO_CODE(3, 2, 18),
+        BC7_MODE_PRIO_CODE(2, 3, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 55),
+        BC7_MODE_PRIO_CODE(4, 1, 61),
+        BC7_MODE_PRIO_CODE(3, 2, 14),
+        BC7_MODE_PRIO_CODE(3, 1, 31),
+        BC7_MODE_PRIO_CODE(1, 2, 34),
+        BC7_MODE_PRIO_CODE(3, 2, 19),
+        BC7_MODE_PRIO_CODE(2, 3, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 30),
+        BC7_MODE_PRIO_CODE(1, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 2, 28),
+        BC7_MODE_PRIO_CODE(4, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 40),
+        BC7_MODE_PRIO_CODE(2, 2, 22),
+        BC7_MODE_PRIO_CODE(4, 1, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 7),
+        BC7_MODE_PRIO_CODE(1, 1, 50),
+        BC7_MODE_PRIO_CODE(2, 1, 41),
+        BC7_MODE_PRIO_CODE(1, 2, 9),
+        BC7_MODE_PRIO_CODE(1, 2, 39),
+        BC7_MODE_PRIO_CODE(2, 2, 25),
+        BC7_MODE_PRIO_CODE(1, 3, 6),
+        BC7_MODE_PRIO_CODE(3, 2, 21),
+        BC7_MODE_PRIO_CODE(1, 1, 37),
+        BC7_MODE_PRIO_CODE(2, 2, 58),
+        BC7_MODE_PRIO_CODE(3, 3, 29),
+        BC7_MODE_PRIO_CODE(4, 1, 62),
+        BC7_MODE_PRIO_CODE(1, 2, 35),
+        BC7_MODE_PRIO_CODE(3, 1, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 28),
+        BC7_MODE_PRIO_CODE(1, 3, 23),
+        BC7_MODE_PRIO_CODE(4, 1, 30),
+        BC7_MODE_PRIO_CODE(2, 1, 45),
+        BC7_MODE_PRIO_CODE(1, 3, 16),
+        BC7_MODE_PRIO_CODE(4, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 38),
+        BC7_MODE_PRIO_CODE(4, 1, 63),
+        BC7_MODE_PRIO_CODE(1, 3, 22),
+        BC7_MODE_PRIO_CODE(1, 2, 30),
+        BC7_MODE_PRIO_CODE(2, 2, 31),
+        BC7_MODE_PRIO_CODE(1, 3, 20),
+        BC7_MODE_PRIO_CODE(2, 2, 9),
+        BC7_MODE_PRIO_CODE(2, 3, 3),
+        BC7_MODE_PRIO_CODE(3, 2, 22),
+        BC7_MODE_PRIO_CODE(2, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 2, 62),
+        BC7_MODE_PRIO_CODE(3, 2, 20),
+        BC7_MODE_PRIO_CODE(4, 1, 32),
+        BC7_MODE_PRIO_CODE(2, 1, 43),
+        BC7_MODE_PRIO_CODE(3, 1, 58),
+        BC7_MODE_PRIO_CODE(2, 3, 19),
+        BC7_MODE_PRIO_CODE(2, 2, 32),
+        BC7_MODE_PRIO_CODE(2, 2, 57),
+        BC7_MODE_PRIO_CODE(4, 1, 27),
+        BC7_MODE_PRIO_CODE(2, 2, 34),
+        BC7_MODE_PRIO_CODE(4, 1, 58),
+        BC7_MODE_PRIO_CODE(1, 2, 12),
+        BC7_MODE_PRIO_CODE(2, 2, 12),
+        BC7_MODE_PRIO_CODE(1, 4, 20),
+        BC7_MODE_PRIO_CODE(1, 2, 56),
+        BC7_MODE_PRIO_CODE(2, 1, 48),
+        BC7_MODE_PRIO_CODE(2, 1, 36),
+        BC7_MODE_PRIO_CODE(4, 3, 0),
+        BC7_MODE_PRIO_CODE(2, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 1, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 7),
+        BC7_MODE_PRIO_CODE(1, 2, 37),
+        BC7_MODE_PRIO_CODE(2, 2, 35),
+        BC7_MODE_PRIO_CODE(3, 1, 52),
+        BC7_MODE_PRIO_CODE(2, 3, 6),
+        BC7_MODE_PRIO_CODE(3, 1, 57),
+        BC7_MODE_PRIO_CODE(4, 1, 31),
+        BC7_MODE_PRIO_CODE(4, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 44),
+        BC7_MODE_PRIO_CODE(3, 3, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 54),
+        BC7_MODE_PRIO_CODE(2, 1, 50),
+        BC7_MODE_PRIO_CODE(3, 3, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 27),
+        BC7_MODE_PRIO_CODE(3, 4, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 14),
+        BC7_MODE_PRIO_CODE(3, 2, 25),
+        BC7_MODE_PRIO_CODE(2, 3, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 2, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 23),
+        BC7_MODE_PRIO_CODE(3, 2, 12),
+        BC7_MODE_PRIO_CODE(3, 3, 2),
+        BC7_MODE_PRIO_CODE(4, 2, 14),
+        BC7_MODE_PRIO_CODE(2, 3, 16),
+        BC7_MODE_PRIO_CODE(1, 2, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 11),
+        BC7_MODE_PRIO_CODE(1, 2, 4),
+        BC7_MODE_PRIO_CODE(4, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 3, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 43),
+        BC7_MODE_PRIO_CODE(2, 4, 21),
+        BC7_MODE_PRIO_CODE(4, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 1, 53),
+        BC7_MODE_PRIO_CODE(3, 1, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 61),
+        BC7_MODE_PRIO_CODE(2, 2, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 23),
+        BC7_MODE_PRIO_CODE(3, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 3, 8),
+        BC7_MODE_PRIO_CODE(3, 1, 55),
+        BC7_MODE_PRIO_CODE(4, 1, 59),
+        BC7_MODE_PRIO_CODE(3, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 3, 20),
+        BC7_MODE_PRIO_CODE(3, 2, 57),
+        BC7_MODE_PRIO_CODE(3, 1, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 35),
+        BC7_MODE_PRIO_CODE(1, 1, 38),
+        BC7_MODE_PRIO_CODE(1, 2, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 6),
+        BC7_MODE_PRIO_CODE(3, 2, 23),
+        BC7_MODE_PRIO_CODE(2, 2, 59),
+        BC7_MODE_PRIO_CODE(3, 2, 5),
+        BC7_MODE_PRIO_CODE(4, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 1, 37),
+        BC7_MODE_PRIO_CODE(3, 2, 59),
+        BC7_MODE_PRIO_CODE(4, 2, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 4),
+        BC7_MODE_PRIO_CODE(2, 2, 56),
+        BC7_MODE_PRIO_CODE(1, 3, 33),
+        BC7_MODE_PRIO_CODE(2, 3, 33),
+        BC7_MODE_PRIO_CODE(2, 3, 22),
+        BC7_MODE_PRIO_CODE(2, 3, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 34),
+        BC7_MODE_PRIO_CODE(3, 2, 56),
+        BC7_MODE_PRIO_CODE(3, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 2, 7),
+        BC7_MODE_PRIO_CODE(2, 2, 7),
+        BC7_MODE_PRIO_CODE(3, 2, 7),
+        BC7_MODE_PRIO_CODE(2, 2, 36),
+        BC7_MODE_PRIO_CODE(3, 2, 36),
+        BC7_MODE_PRIO_CODE(4, 1, 52),
+        BC7_MODE_PRIO_CODE(2, 2, 33),
+        BC7_MODE_PRIO_CODE(3, 1, 45),
+        BC7_MODE_PRIO_CODE(1, 3, 4),
+        BC7_MODE_PRIO_CODE(4, 2, 15),
+        BC7_MODE_PRIO_CODE(3, 1, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 4),
+        BC7_MODE_PRIO_CODE(2, 5, 20),
+        BC7_MODE_PRIO_CODE(3, 2, 62),
+        BC7_MODE_PRIO_CODE(1, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 1, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 52),
+        BC7_MODE_PRIO_CODE(3, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 1, 39),
+        BC7_MODE_PRIO_CODE(3, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 48),
+        BC7_MODE_PRIO_CODE(3, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 2, 32),
+        BC7_MODE_PRIO_CODE(3, 3, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 17),
+        BC7_MODE_PRIO_CODE(4, 1, 57),
+        BC7_MODE_PRIO_CODE(1, 3, 25),
+        BC7_MODE_PRIO_CODE(2, 3, 11),
+        BC7_MODE_PRIO_CODE(1, 3, 61),
+        BC7_MODE_PRIO_CODE(4, 1, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 60),
+        BC7_MODE_PRIO_CODE(2, 3, 60),
+        BC7_MODE_PRIO_CODE(2, 2, 28),
+        BC7_MODE_PRIO_CODE(3, 2, 28),
+        BC7_MODE_PRIO_CODE(4, 1, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 51),
+        BC7_MODE_PRIO_CODE(4, 1, 53),
+        BC7_MODE_PRIO_CODE(4, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 3, 32),
+        BC7_MODE_PRIO_CODE(1, 3, 24),
+        BC7_MODE_PRIO_CODE(4, 1, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 51),
+        BC7_MODE_PRIO_CODE(4, 2, 12),
+        BC7_MODE_PRIO_CODE(2, 3, 61),
+        BC7_MODE_PRIO_CODE(3, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 3, 32),
+        BC7_MODE_PRIO_CODE(3, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 3, 29),
+        BC7_MODE_PRIO_CODE(4, 2, 63),
+        BC7_MODE_PRIO_CODE(2, 2, 27),
+        BC7_MODE_PRIO_CODE(2, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 1, 50),
+        BC7_MODE_PRIO_CODE(3, 2, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 63),
+        BC7_MODE_PRIO_CODE(2, 3, 63),
+        BC7_MODE_PRIO_CODE(3, 2, 27),
+        BC7_MODE_PRIO_CODE(4, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 26),
+        BC7_MODE_PRIO_CODE(2, 3, 4),
+        BC7_MODE_PRIO_CODE(2, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 45),
+        BC7_MODE_PRIO_CODE(4, 1, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 1),
+        BC7_MODE_PRIO_CODE(4, 2, 6),
+        BC7_MODE_PRIO_CODE(1, 3, 62),
+        BC7_MODE_PRIO_CODE(2, 3, 62),
+        BC7_MODE_PRIO_CODE(2, 1, 44),
+        BC7_MODE_PRIO_CODE(4, 1, 49),
+        BC7_MODE_PRIO_CODE(3, 5, 30),
+        BC7_MODE_PRIO_CODE(2, 3, 25),
+        BC7_MODE_PRIO_CODE(1, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 1, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 3),
+        BC7_MODE_PRIO_CODE(3, 1, 37),
+        BC7_MODE_PRIO_CODE(1, 2, 0),
+        BC7_MODE_PRIO_CODE(2, 2, 0),
+        BC7_MODE_PRIO_CODE(2, 3, 35),
+        BC7_MODE_PRIO_CODE(2, 3, 24),
+        BC7_MODE_PRIO_CODE(2, 2, 53),
+        BC7_MODE_PRIO_CODE(3, 2, 53),
+        BC7_MODE_PRIO_CODE(4, 2, 59),
+        BC7_MODE_PRIO_CODE(3, 3, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 3),
+        BC7_MODE_PRIO_CODE(2, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 3, 32),
+        BC7_MODE_PRIO_CODE(1, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 2, 62),
+        BC7_MODE_PRIO_CODE(4, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 2, 30),
+        BC7_MODE_PRIO_CODE(1, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 2, 36),
+        BC7_MODE_PRIO_CODE(2, 2, 1),
+        BC7_MODE_PRIO_CODE(3, 2, 1),
+        BC7_MODE_PRIO_CODE(3, 2, 58),
+        BC7_MODE_PRIO_CODE(4, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 3, 16),
+        BC7_MODE_PRIO_CODE(2, 3, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 39),
+        BC7_MODE_PRIO_CODE(4, 1, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 21),
+        BC7_MODE_PRIO_CODE(2, 1, 38),
+        BC7_MODE_PRIO_CODE(4, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 3, 23),
+        BC7_MODE_PRIO_CODE(1, 2, 43),
+        BC7_MODE_PRIO_CODE(1, 2, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 41),
+        BC7_MODE_PRIO_CODE(1, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 2, 35),
+        BC7_MODE_PRIO_CODE(4, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 1, 34),
+        BC7_MODE_PRIO_CODE(2, 2, 29),
+        BC7_MODE_PRIO_CODE(3, 2, 29),
+        BC7_MODE_PRIO_CODE(3, 2, 52),
+        BC7_MODE_PRIO_CODE(1, 3, 58),
+        BC7_MODE_PRIO_CODE(4, 5, 30),
+        BC7_MODE_PRIO_CODE(4, 3, 33),
+        BC7_MODE_PRIO_CODE(3, 2, 30),
+        BC7_MODE_PRIO_CODE(1, 2, 44),
+        BC7_MODE_PRIO_CODE(1, 2, 2),
+        BC7_MODE_PRIO_CODE(2, 2, 2),
+        BC7_MODE_PRIO_CODE(3, 2, 2),
+        BC7_MODE_PRIO_CODE(1, 2, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 47),
+        BC7_MODE_PRIO_CODE(3, 3, 7),
+        BC7_MODE_PRIO_CODE(2, 3, 58),
+        BC7_MODE_PRIO_CODE(3, 2, 55),
+        BC7_MODE_PRIO_CODE(4, 2, 4),
+        BC7_MODE_PRIO_CODE(3, 2, 0),
+        BC7_MODE_PRIO_CODE(1, 3, 31),
+        BC7_MODE_PRIO_CODE(3, 2, 31),
+        BC7_MODE_PRIO_CODE(3, 3, 12),
+        BC7_MODE_PRIO_CODE(3, 2, 51),
+        BC7_MODE_PRIO_CODE(2, 1, 39),
+        BC7_MODE_PRIO_CODE(1, 3, 48),
+        BC7_MODE_PRIO_CODE(1, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 2, 25),
+        BC7_MODE_PRIO_CODE(4, 2, 22),
+        BC7_MODE_PRIO_CODE(4, 2, 18),
+        BC7_MODE_PRIO_CODE(2, 2, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 28),
+        BC7_MODE_PRIO_CODE(3, 1, 44),
+        BC7_MODE_PRIO_CODE(2, 1, 34),
+        BC7_MODE_PRIO_CODE(3, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 2, 54),
+        BC7_MODE_PRIO_CODE(4, 2, 7),
+        BC7_MODE_PRIO_CODE(4, 2, 20),
+        BC7_MODE_PRIO_CODE(2, 2, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 43),
+        BC7_MODE_PRIO_CODE(2, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 3, 30),
+        BC7_MODE_PRIO_CODE(4, 2, 5),
+        BC7_MODE_PRIO_CODE(4, 2, 61),
+        BC7_MODE_PRIO_CODE(4, 2, 19),
+        BC7_MODE_PRIO_CODE(4, 2, 23),
+        BC7_MODE_PRIO_CODE(3, 2, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 27),
+        BC7_MODE_PRIO_CODE(1, 3, 57),
+        BC7_MODE_PRIO_CODE(2, 3, 57),
+        BC7_MODE_PRIO_CODE(3, 3, 21),
+        BC7_MODE_PRIO_CODE(3, 3, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 48),
+        BC7_MODE_PRIO_CODE(4, 1, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 19),
+        BC7_MODE_PRIO_CODE(3, 1, 38),
+        BC7_MODE_PRIO_CODE(2, 2, 38),
+        BC7_MODE_PRIO_CODE(2, 3, 31),
+        BC7_MODE_PRIO_CODE(2, 2, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 40),
+        BC7_MODE_PRIO_CODE(1, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 1, 38),
+        BC7_MODE_PRIO_CODE(1, 3, 41),
+        BC7_MODE_PRIO_CODE(1, 3, 50),
+        BC7_MODE_PRIO_CODE(2, 3, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 3, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 34),
+        BC7_MODE_PRIO_CODE(4, 1, 34),
+        BC7_MODE_PRIO_CODE(2, 3, 50),
+        BC7_MODE_PRIO_CODE(1, 3, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 40),
+        BC7_MODE_PRIO_CODE(1, 3, 51),
+        BC7_MODE_PRIO_CODE(2, 3, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 45),
+        BC7_MODE_PRIO_CODE(2, 3, 45),
+        BC7_MODE_PRIO_CODE(2, 3, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 20),
+        BC7_MODE_PRIO_CODE(2, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 2, 57),
+        BC7_MODE_PRIO_CODE(2, 4, 20),
+        BC7_MODE_PRIO_CODE(3, 3, 4),
+        BC7_MODE_PRIO_CODE(3, 3, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 46),
+        BC7_MODE_PRIO_CODE(2, 3, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 1),
+        BC7_MODE_PRIO_CODE(3, 3, 22),
+        BC7_MODE_PRIO_CODE(1, 3, 49),
+        BC7_MODE_PRIO_CODE(2, 3, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 15),
+        BC7_MODE_PRIO_CODE(3, 3, 5),
+        BC7_MODE_PRIO_CODE(4, 1, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 14),
+        BC7_MODE_PRIO_CODE(4, 3, 2),
+        BC7_MODE_PRIO_CODE(3, 3, 60),
+        BC7_MODE_PRIO_CODE(1, 3, 53),
+        BC7_MODE_PRIO_CODE(2, 3, 53),
+        BC7_MODE_PRIO_CODE(4, 3, 32),
+        BC7_MODE_PRIO_CODE(3, 3, 24),
+        BC7_MODE_PRIO_CODE(3, 3, 63),
+        BC7_MODE_PRIO_CODE(3, 2, 37),
+        BC7_MODE_PRIO_CODE(1, 3, 52),
+        BC7_MODE_PRIO_CODE(2, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 4, 30),
+        BC7_MODE_PRIO_CODE(4, 2, 34),
+        BC7_MODE_PRIO_CODE(1, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 3, 62),
+        BC7_MODE_PRIO_CODE(3, 3, 18),
+        BC7_MODE_PRIO_CODE(3, 2, 41),
+        BC7_MODE_PRIO_CODE(4, 2, 58),
+        BC7_MODE_PRIO_CODE(1, 3, 42),
+        BC7_MODE_PRIO_CODE(2, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 2, 0),
+        BC7_MODE_PRIO_CODE(4, 2, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 47),
+        BC7_MODE_PRIO_CODE(4, 2, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 55),
+        BC7_MODE_PRIO_CODE(4, 2, 32),
+        BC7_MODE_PRIO_CODE(3, 2, 43),
+        BC7_MODE_PRIO_CODE(3, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 5, 20),
+        BC7_MODE_PRIO_CODE(4, 5, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 36),
+        BC7_MODE_PRIO_CODE(2, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 2, 54),
+        BC7_MODE_PRIO_CODE(2, 2, 49),
+        BC7_MODE_PRIO_CODE(3, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 1, 39),
+        BC7_MODE_PRIO_CODE(4, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 2, 52),
+        BC7_MODE_PRIO_CODE(4, 2, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 50),
+        BC7_MODE_PRIO_CODE(3, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 31),
+        BC7_MODE_PRIO_CODE(4, 3, 3),
+        BC7_MODE_PRIO_CODE(1, 2, 48),
+        BC7_MODE_PRIO_CODE(2, 2, 48),
+        BC7_MODE_PRIO_CODE(3, 2, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 3, 9),
+        BC7_MODE_PRIO_CODE(1, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 10),
+        BC7_MODE_PRIO_CODE(3, 3, 31),
+        BC7_MODE_PRIO_CODE(4, 2, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 37),
+        BC7_MODE_PRIO_CODE(2, 3, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 50),
+        BC7_MODE_PRIO_CODE(2, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 20),
+        BC7_MODE_PRIO_CODE(3, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 3, 6),
+        BC7_MODE_PRIO_CODE(4, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 2, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 58),
+        BC7_MODE_PRIO_CODE(3, 3, 59),
+        BC7_MODE_PRIO_CODE(4, 2, 56),
+        BC7_MODE_PRIO_CODE(1, 3, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 2, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 7),
+        BC7_MODE_PRIO_CODE(3, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 3, 23),
+        BC7_MODE_PRIO_CODE(3, 3, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 22),
+        BC7_MODE_PRIO_CODE(3, 3, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 42),
+        BC7_MODE_PRIO_CODE(2, 2, 42),
+        BC7_MODE_PRIO_CODE(3, 2, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 19),
+        BC7_MODE_PRIO_CODE(4, 3, 21),
+        BC7_MODE_PRIO_CODE(2, 2, 46),
+        BC7_MODE_PRIO_CODE(3, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 2, 28),
+        BC7_MODE_PRIO_CODE(3, 3, 49),
+        BC7_MODE_PRIO_CODE(3, 3, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 55),
+        BC7_MODE_PRIO_CODE(2, 2, 26),
+        BC7_MODE_PRIO_CODE(3, 2, 26),
+        BC7_MODE_PRIO_CODE(4, 2, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 2, 41),
+        BC7_MODE_PRIO_CODE(4, 2, 29),
+        BC7_MODE_PRIO_CODE(1, 3, 34),
+        BC7_MODE_PRIO_CODE(2, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 2, 44),
+        BC7_MODE_PRIO_CODE(3, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 2, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 11),
+        BC7_MODE_PRIO_CODE(3, 3, 57),
+        BC7_MODE_PRIO_CODE(3, 2, 38),
+        BC7_MODE_PRIO_CODE(3, 3, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 25),
+        BC7_MODE_PRIO_CODE(4, 3, 4),
+        BC7_MODE_PRIO_CODE(3, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 61),
+        BC7_MODE_PRIO_CODE(4, 2, 48),
+        BC7_MODE_PRIO_CODE(4, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 3, 54),
+        BC7_MODE_PRIO_CODE(4, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 3, 24),
+        BC7_MODE_PRIO_CODE(4, 3, 12),
+        BC7_MODE_PRIO_CODE(4, 2, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 63),
+        BC7_MODE_PRIO_CODE(4, 3, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 3, 60),
+        BC7_MODE_PRIO_CODE(4, 2, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 62),
+        BC7_MODE_PRIO_CODE(4, 3, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 58),
+        BC7_MODE_PRIO_CODE(4, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 2, 26),
+        BC7_MODE_PRIO_CODE(4, 2, 27),
+        BC7_MODE_PRIO_CODE(3, 3, 37),
+        BC7_MODE_PRIO_CODE(4, 3, 57),
+        BC7_MODE_PRIO_CODE(4, 3, 48),
+        BC7_MODE_PRIO_CODE(4, 3, 31),
+        BC7_MODE_PRIO_CODE(4, 3, 51),
+        BC7_MODE_PRIO_CODE(4, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 3, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 40),
+        BC7_MODE_PRIO_CODE(4, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 2, 2),
+        BC7_MODE_PRIO_CODE(3, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 2, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 3, 30),
+        BC7_MODE_PRIO_CODE(4, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 2, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 45),
+        BC7_MODE_PRIO_CODE(4, 2, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 46),
+        BC7_MODE_PRIO_CODE(2, 2, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 3, 37),
+        BC7_MODE_PRIO_CODE(4, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 55),
+        BC7_MODE_PRIO_CODE(4, 3, 44),
+        BC7_MODE_PRIO_CODE(3, 2, 45),
+        BC7_MODE_PRIO_CODE(1, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 5, 0),
+        BC7_MODE_PRIO_CODE(4, 2, 45),
+        BC7_MODE_PRIO_CODE(2, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 4, 1),
+        BC7_MODE_PRIO_CODE(2, 5, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 1),
+        BC7_MODE_PRIO_CODE(3, 5, 0),
+        BC7_MODE_PRIO_CODE(4, 4, 0),
+        BC7_MODE_PRIO_CODE(4, 4, 1),
+        BC7_MODE_PRIO_CODE(4, 5, 0),
+    };
+
+    const uint16_t *g_bc7PrioCodesRGB = &g_bc7PrioCodesRGBData[0]; // Address of the first entry of g_bc7PrioCodesRGBData.
+    const int g_bc7NumPrioCodesRGB = static_cast<int>(sizeof(g_bc7PrioCodesRGBData) / sizeof(*g_bc7PrioCodesRGBData)); // Number of entries in that array.
+
+    const uint16_t g_bc7PrioCodesRGBAData[] = // Table of uint16_t codes, each produced by BC7_MODE_PRIO_CODE from three integer arguments; entry order is part of the data.
+    { // NOTE(review): the macro is defined outside this view; presumably these codes are decoded by the Unpack* helpers in this file - confirm.
+        BC7_MODE_PRIO_CODE(1, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 0),
+        BC7_MODE_PRIO_CODE(2, 4, 1),
+        BC7_MODE_PRIO_CODE(3, 4, 1),
+        BC7_MODE_PRIO_CODE(2, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 6),
+        BC7_MODE_PRIO_CODE(1, 4, 10),
+        BC7_MODE_PRIO_CODE(1, 7, 15),
+        BC7_MODE_PRIO_CODE(1, 7, 14),
+        BC7_MODE_PRIO_CODE(1, 4, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 7),
+        BC7_MODE_PRIO_CODE(3, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 19),
+        BC7_MODE_PRIO_CODE(3, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 7, 1),
+        BC7_MODE_PRIO_CODE(1, 7, 21),
+        BC7_MODE_PRIO_CODE(4, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 7, 10),
+        BC7_MODE_PRIO_CODE(1, 7, 3),
+        BC7_MODE_PRIO_CODE(4, 6, 0),
+        BC7_MODE_PRIO_CODE(3, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 7, 16),
+        BC7_MODE_PRIO_CODE(1, 7, 8),
+        BC7_MODE_PRIO_CODE(2, 5, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 9),
+        BC7_MODE_PRIO_CODE(2, 4, 11),
+        BC7_MODE_PRIO_CODE(3, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 7, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 22),
+        BC7_MODE_PRIO_CODE(4, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 13),
+        BC7_MODE_PRIO_CODE(3, 7, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 12),
+        BC7_MODE_PRIO_CODE(1, 7, 29),
+        BC7_MODE_PRIO_CODE(3, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 7, 11),
+        BC7_MODE_PRIO_CODE(1, 7, 18),
+        BC7_MODE_PRIO_CODE(1, 7, 4),
+        BC7_MODE_PRIO_CODE(2, 7, 15),
+        BC7_MODE_PRIO_CODE(2, 7, 14),
+        BC7_MODE_PRIO_CODE(1, 7, 5),
+        BC7_MODE_PRIO_CODE(1, 7, 25),
+        BC7_MODE_PRIO_CODE(1, 7, 17),
+        BC7_MODE_PRIO_CODE(1, 7, 24),
+        BC7_MODE_PRIO_CODE(1, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 5, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 1),
+        BC7_MODE_PRIO_CODE(2, 7, 29),
+        BC7_MODE_PRIO_CODE(2, 4, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 6),
+        BC7_MODE_PRIO_CODE(2, 7, 7),
+        BC7_MODE_PRIO_CODE(3, 7, 14),
+        BC7_MODE_PRIO_CODE(3, 7, 15),
+        BC7_MODE_PRIO_CODE(4, 4, 31),
+        BC7_MODE_PRIO_CODE(2, 7, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 7, 29),
+        BC7_MODE_PRIO_CODE(2, 7, 19),
+        BC7_MODE_PRIO_CODE(2, 7, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 1),
+        BC7_MODE_PRIO_CODE(4, 7, 29),
+        BC7_MODE_PRIO_CODE(3, 7, 7),
+        BC7_MODE_PRIO_CODE(1, 4, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 2),
+        BC7_MODE_PRIO_CODE(2, 7, 16),
+        BC7_MODE_PRIO_CODE(2, 7, 3),
+        BC7_MODE_PRIO_CODE(2, 5, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 23),
+        BC7_MODE_PRIO_CODE(3, 7, 6),
+        BC7_MODE_PRIO_CODE(2, 7, 12),
+        BC7_MODE_PRIO_CODE(1, 7, 61),
+        BC7_MODE_PRIO_CODE(4, 4, 11),
+        BC7_MODE_PRIO_CODE(3, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 10),
+        BC7_MODE_PRIO_CODE(2, 7, 8),
+        BC7_MODE_PRIO_CODE(2, 7, 22),
+        BC7_MODE_PRIO_CODE(2, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 9),
+        BC7_MODE_PRIO_CODE(3, 7, 19),
+        BC7_MODE_PRIO_CODE(2, 7, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 7, 24),
+        BC7_MODE_PRIO_CODE(1, 7, 60),
+        BC7_MODE_PRIO_CODE(2, 7, 11),
+        BC7_MODE_PRIO_CODE(2, 7, 18),
+        BC7_MODE_PRIO_CODE(2, 7, 17),
+        BC7_MODE_PRIO_CODE(2, 7, 4),
+        BC7_MODE_PRIO_CODE(2, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 3),
+        BC7_MODE_PRIO_CODE(3, 7, 16),
+        BC7_MODE_PRIO_CODE(3, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 7, 21),
+        BC7_MODE_PRIO_CODE(1, 7, 62),
+        BC7_MODE_PRIO_CODE(2, 7, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 33),
+        BC7_MODE_PRIO_CODE(2, 7, 33),
+        BC7_MODE_PRIO_CODE(3, 7, 33),
+        BC7_MODE_PRIO_CODE(4, 7, 33),
+        BC7_MODE_PRIO_CODE(3, 7, 11),
+        BC7_MODE_PRIO_CODE(3, 7, 12),
+        BC7_MODE_PRIO_CODE(4, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 7, 25),
+        BC7_MODE_PRIO_CODE(1, 7, 63),
+        BC7_MODE_PRIO_CODE(2, 5, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 8),
+        BC7_MODE_PRIO_CODE(4, 5, 0),
+        BC7_MODE_PRIO_CODE(3, 7, 24),
+        BC7_MODE_PRIO_CODE(3, 7, 22),
+        BC7_MODE_PRIO_CODE(3, 7, 9),
+        BC7_MODE_PRIO_CODE(1, 7, 32),
+        BC7_MODE_PRIO_CODE(2, 7, 61),
+        BC7_MODE_PRIO_CODE(3, 7, 4),
+        BC7_MODE_PRIO_CODE(3, 5, 30),
+        BC7_MODE_PRIO_CODE(3, 7, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 14),
+        BC7_MODE_PRIO_CODE(3, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 18),
+        BC7_MODE_PRIO_CODE(1, 7, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 4, 21),
+        BC7_MODE_PRIO_CODE(4, 7, 15),
+        BC7_MODE_PRIO_CODE(3, 7, 17),
+        BC7_MODE_PRIO_CODE(2, 7, 32),
+        BC7_MODE_PRIO_CODE(3, 7, 32),
+        BC7_MODE_PRIO_CODE(2, 5, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 1),
+        BC7_MODE_PRIO_CODE(4, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 7, 28),
+        BC7_MODE_PRIO_CODE(1, 7, 54),
+        BC7_MODE_PRIO_CODE(4, 7, 32),
+        BC7_MODE_PRIO_CODE(1, 7, 27),
+        BC7_MODE_PRIO_CODE(4, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 7, 60),
+        BC7_MODE_PRIO_CODE(2, 4, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 63),
+        BC7_MODE_PRIO_CODE(4, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 62),
+        BC7_MODE_PRIO_CODE(1, 7, 41),
+        BC7_MODE_PRIO_CODE(1, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 60),
+        BC7_MODE_PRIO_CODE(1, 7, 40),
+        BC7_MODE_PRIO_CODE(1, 7, 55),
+        BC7_MODE_PRIO_CODE(2, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 8),
+        BC7_MODE_PRIO_CODE(4, 7, 6),
+        BC7_MODE_PRIO_CODE(1, 7, 53),
+        BC7_MODE_PRIO_CODE(4, 7, 9),
+        BC7_MODE_PRIO_CODE(3, 7, 61),
+        BC7_MODE_PRIO_CODE(3, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 22),
+        BC7_MODE_PRIO_CODE(4, 7, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 62),
+        BC7_MODE_PRIO_CODE(4, 7, 7),
+        BC7_MODE_PRIO_CODE(1, 7, 42),
+        BC7_MODE_PRIO_CODE(1, 7, 52),
+        BC7_MODE_PRIO_CODE(4, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 56),
+        BC7_MODE_PRIO_CODE(1, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 5, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 48),
+        BC7_MODE_PRIO_CODE(2, 7, 28),
+        BC7_MODE_PRIO_CODE(3, 7, 28),
+        BC7_MODE_PRIO_CODE(4, 7, 19),
+        BC7_MODE_PRIO_CODE(3, 7, 35),
+        BC7_MODE_PRIO_CODE(1, 7, 59),
+        BC7_MODE_PRIO_CODE(2, 7, 30),
+        BC7_MODE_PRIO_CODE(3, 7, 63),
+        BC7_MODE_PRIO_CODE(4, 7, 21),
+        BC7_MODE_PRIO_CODE(4, 7, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 3),
+        BC7_MODE_PRIO_CODE(1, 7, 47),
+        BC7_MODE_PRIO_CODE(1, 7, 37),
+        BC7_MODE_PRIO_CODE(4, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 57),
+        BC7_MODE_PRIO_CODE(4, 7, 17),
+        BC7_MODE_PRIO_CODE(1, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 24),
+        BC7_MODE_PRIO_CODE(4, 7, 60),
+        BC7_MODE_PRIO_CODE(1, 7, 50),
+        BC7_MODE_PRIO_CODE(2, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 7, 25),
+        BC7_MODE_PRIO_CODE(3, 7, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 59),
+        BC7_MODE_PRIO_CODE(2, 7, 55),
+        BC7_MODE_PRIO_CODE(4, 7, 18),
+        BC7_MODE_PRIO_CODE(4, 7, 12),
+        BC7_MODE_PRIO_CODE(4, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 59),
+        BC7_MODE_PRIO_CODE(1, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 16),
+        BC7_MODE_PRIO_CODE(4, 7, 11),
+        BC7_MODE_PRIO_CODE(2, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 4),
+        BC7_MODE_PRIO_CODE(1, 7, 49),
+        BC7_MODE_PRIO_CODE(2, 7, 27),
+        BC7_MODE_PRIO_CODE(3, 7, 27),
+        BC7_MODE_PRIO_CODE(4, 7, 62),
+        BC7_MODE_PRIO_CODE(3, 7, 58),
+        BC7_MODE_PRIO_CODE(4, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 53),
+        BC7_MODE_PRIO_CODE(3, 7, 53),
+        BC7_MODE_PRIO_CODE(2, 7, 40),
+        BC7_MODE_PRIO_CODE(3, 7, 40),
+        BC7_MODE_PRIO_CODE(2, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 7, 31),
+        BC7_MODE_PRIO_CODE(4, 7, 61),
+        BC7_MODE_PRIO_CODE(1, 7, 36),
+        BC7_MODE_PRIO_CODE(4, 7, 63),
+        BC7_MODE_PRIO_CODE(1, 7, 46),
+        BC7_MODE_PRIO_CODE(3, 7, 55),
+        BC7_MODE_PRIO_CODE(2, 7, 52),
+        BC7_MODE_PRIO_CODE(2, 7, 56),
+        BC7_MODE_PRIO_CODE(2, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 37),
+        BC7_MODE_PRIO_CODE(2, 7, 57),
+        BC7_MODE_PRIO_CODE(3, 7, 57),
+        BC7_MODE_PRIO_CODE(2, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 57),
+        BC7_MODE_PRIO_CODE(2, 7, 49),
+        BC7_MODE_PRIO_CODE(3, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 43),
+        BC7_MODE_PRIO_CODE(3, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 7, 28),
+        BC7_MODE_PRIO_CODE(2, 7, 48),
+        BC7_MODE_PRIO_CODE(3, 7, 52),
+        BC7_MODE_PRIO_CODE(3, 7, 49),
+        BC7_MODE_PRIO_CODE(4, 7, 59),
+        BC7_MODE_PRIO_CODE(4, 7, 40),
+        BC7_MODE_PRIO_CODE(4, 7, 27),
+        BC7_MODE_PRIO_CODE(3, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 55),
+        BC7_MODE_PRIO_CODE(3, 7, 56),
+        BC7_MODE_PRIO_CODE(4, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 54),
+        BC7_MODE_PRIO_CODE(3, 7, 54),
+        BC7_MODE_PRIO_CODE(4, 7, 54),
+        BC7_MODE_PRIO_CODE(2, 7, 47),
+        BC7_MODE_PRIO_CODE(3, 7, 47),
+        BC7_MODE_PRIO_CODE(4, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 7, 37),
+        BC7_MODE_PRIO_CODE(3, 7, 48),
+        BC7_MODE_PRIO_CODE(4, 7, 48),
+        BC7_MODE_PRIO_CODE(4, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 47),
+        BC7_MODE_PRIO_CODE(2, 7, 36),
+        BC7_MODE_PRIO_CODE(1, 7, 44),
+        BC7_MODE_PRIO_CODE(4, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 36),
+        BC7_MODE_PRIO_CODE(2, 7, 50),
+        BC7_MODE_PRIO_CODE(3, 7, 50),
+        BC7_MODE_PRIO_CODE(4, 7, 50),
+        BC7_MODE_PRIO_CODE(4, 7, 52),
+        BC7_MODE_PRIO_CODE(1, 7, 39),
+        BC7_MODE_PRIO_CODE(1, 7, 34),
+        BC7_MODE_PRIO_CODE(1, 7, 38),
+        BC7_MODE_PRIO_CODE(2, 7, 38),
+        BC7_MODE_PRIO_CODE(3, 7, 38),
+        BC7_MODE_PRIO_CODE(4, 7, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 7, 53),
+        BC7_MODE_PRIO_CODE(2, 7, 46),
+        BC7_MODE_PRIO_CODE(3, 7, 46),
+        BC7_MODE_PRIO_CODE(4, 7, 49),
+        BC7_MODE_PRIO_CODE(4, 7, 56),
+        BC7_MODE_PRIO_CODE(4, 7, 37),
+        BC7_MODE_PRIO_CODE(2, 7, 44),
+        BC7_MODE_PRIO_CODE(3, 7, 44),
+        BC7_MODE_PRIO_CODE(4, 7, 36),
+        BC7_MODE_PRIO_CODE(2, 7, 39),
+        BC7_MODE_PRIO_CODE(2, 7, 34),
+        BC7_MODE_PRIO_CODE(4, 7, 38),
+        BC7_MODE_PRIO_CODE(3, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 46),
+        BC7_MODE_PRIO_CODE(4, 7, 44),
+        BC7_MODE_PRIO_CODE(3, 7, 39),
+        BC7_MODE_PRIO_CODE(3, 7, 34),
+        BC7_MODE_PRIO_CODE(4, 7, 39),
+        BC7_MODE_PRIO_CODE(4, 7, 34),
+    };
+
+    const uint16_t *g_bc7PrioCodesRGBA = g_bc7PrioCodesRGBAData; // Points at the first entry of g_bc7PrioCodesRGBAData.
+    const int g_bc7NumPrioCodesRGBA = sizeof(g_bc7PrioCodesRGBAData) / sizeof(g_bc7PrioCodesRGBAData[0]); // Entry count; the divisor names the measured array itself, as the RGB counterpart does.
+
+    int UnpackMode(uint16_t packed) { // Returns the BC7_MODE_BITS-wide field found at bit BC7_MODE_OFFSET_BITS of packed.
+        const int fieldMask = (1 << BC7_MODE_BITS) - 1;
+        return static_cast<int>((packed >> BC7_MODE_OFFSET_BITS) & fieldMask);
+    }
+
+    int UnpackSeedPointCount(uint16_t packed) { // Returns the BC7_SEED_POINT_COUNT_BITS-wide field at bit BC7_SEED_POINT_COUNT_OFFSET_BITS, plus one.
+        const int fieldMask = (1 << BC7_SEED_POINT_COUNT_BITS) - 1;
+        return static_cast<int>((packed >> BC7_SEED_POINT_COUNT_OFFSET_BITS) & fieldMask) + 1;
+    }
+
+    int UnpackPartition(uint16_t packed) { // Returns the BC7_PARTITION_BITS-wide field found at bit BC7_PARTITION_OFFSET_BITS of packed.
+        const int fieldMask = (1 << BC7_PARTITION_BITS) - 1;
+        return static_cast<int>((packed >> BC7_PARTITION_OFFSET_BITS) & fieldMask);
+    }
+
+    int UnpackRotation(uint16_t packed) { // Returns the BC7_ROTATION_BITS-wide field found at bit BC7_ROTATION_OFFSET_BITS of packed.
+        const int fieldMask = (1 << BC7_ROTATION_BITS) - 1;
+        return static_cast<int>((packed >> BC7_ROTATION_OFFSET_BITS) & fieldMask);
+    }
+
+    int UnpackIndexSelector(uint16_t packed) { // Returns the BC7_INDEX_MODE_BITS-wide field found at bit BC7_INDEX_MODE_OFFSET_BITS of packed.
+        const int fieldMask = (1 << BC7_INDEX_MODE_BITS) - 1;
+        return static_cast<int>((packed >> BC7_INDEX_MODE_OFFSET_BITS) & fieldMask);
+    }
+}}}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h b/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
index b5564c0dab..b45ba5eca8 100644
--- a/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
@@ -1,6 +1,8 @@
 #pragma once
 #include <stdint.h>
 
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
 namespace cvtt { namespace Tables { namespace BC7SC {
 
 struct TableEntry
diff --git a/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp b/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp
new file mode 100644
index 0000000000..be16d1db06
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp
@@ -0,0 +1,46 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BCCommon.h"
+
+// Returns 3 when range equals 3;
+// every other range value yields 4.
+int cvtt::Internal::BCCommon::TweakRoundsForRange(int range)
+{
+    return (range == 3) ? 3 : 4;
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BCCommon.h b/thirdparty/cvtt/ConvectionKernels_BCCommon.h
new file mode 100644
index 0000000000..3e13151acd
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BCCommon.h
@@ -0,0 +1,104 @@
+#pragma once
+#ifndef __CVTT_BCCOMMON_H__
+#define __CVTT_BCCOMMON_H__
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        // Stateless collection of error-metric and pixel pre-weighting helpers.
+        // Every member is static and works on ParallelMath vector types.
+        class BCCommon
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+
+            // Defined in ConvectionKernels_BCCommon.cpp.
+            static int TweakRoundsForRange(int range);
+
+            // Feeds ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]) into
+            // aggError for each of the first numRealChannels channels.
+            // The flags argument is accepted but not consulted here.
+            template<int TVectorSize>
+            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
+            {
+                int channel = 0;
+                while (channel < numRealChannels)
+                {
+                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[channel], original[channel]), channel);
+                    channel++;
+                }
+            }
+
+            // Convenience overload: treats all TVectorSize channels as real channels.
+            template<int TVectorSize>
+            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
+            {
+                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
+            }
+
+            // Accumulates the LDR error into a fresh AggregatedError and returns the
+            // result of its Finalize(flags, channelWeightsSq).
+            template<int TVectorSize>
+            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
+            {
+                AggregatedError<TVectorSize> accumulated;
+                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, accumulated);
+                return accumulated.Finalize(flags, channelWeightsSq);
+            }
+
+            // Sums ParallelMath::SqDiffSInt16 over all channels. Each term is scaled
+            // by channelWeightsSq[ch] unless Flags::Uniform is set in flags.
+            template<int TVectorSize>
+            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
+            {
+                MFloat total = ParallelMath::MakeFloatZero();
+
+                if (flags & Flags::Uniform)
+                {
+                    // Uniform weighting: plain sum, weights are not read.
+                    for (int channel = 0; channel < TVectorSize; channel++)
+                        total = total + ParallelMath::SqDiffSInt16(reconstructed[channel], original[channel]);
+                    return total;
+                }
+
+                for (int channel = 0; channel < TVectorSize; channel++)
+                    total = total + ParallelMath::SqDiffSInt16(reconstructed[channel], original[channel]) * ParallelMath::MakeFloat(channelWeightsSq[channel]);
+
+                return total;
+            }
+
+            // Same structure as ComputeErrorHDRFast, but each per-channel term comes
+            // from ParallelMath::SqDiff2CL instead of ParallelMath::SqDiffSInt16.
+            template<int TVectorSize>
+            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
+            {
+                MFloat total = ParallelMath::MakeFloatZero();
+
+                if (flags & Flags::Uniform)
+                {
+                    // Uniform weighting: plain sum, weights are not read.
+                    for (int channel = 0; channel < TVectorSize; channel++)
+                        total = total + ParallelMath::SqDiff2CL(reconstructed[channel], original[channel]);
+                    return total;
+                }
+
+                for (int channel = 0; channel < TVectorSize; channel++)
+                    total = total + ParallelMath::SqDiff2CL(reconstructed[channel], original[channel]) * ParallelMath::MakeFloat(channelWeightsSq[channel]);
+
+                return total;
+            }
+
+            // Converts each of the 16 pixels to float and multiplies every channel by
+            // its entry in channelWeights, writing the result to preWeightedPixels.
+            template<int TChannelCount>
+            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
+            {
+                for (int px = 0; px < 16; px++)
+                {
+                    MFloat *weightedRow = preWeightedPixels[px];
+                    const MUInt15 *sourceRow = pixels[px];
+
+                    for (int channel = 0; channel < TChannelCount; channel++)
+                        weightedRow[channel] = ParallelMath::ToFloat(sourceRow[channel]) * channelWeights[channel];
+                }
+            }
+
+            // HDR counterpart of PreWeightPixelsLDR: identical computation on
+            // MSInt16 input pixels.
+            template<int TChannelCount>
+            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
+            {
+                for (int px = 0; px < 16; px++)
+                {
+                    MFloat *weightedRow = preWeightedPixels[px];
+                    const MSInt16 *sourceRow = pixels[px];
+
+                    for (int channel = 0; channel < TChannelCount; channel++)
+                        weightedRow[channel] = ParallelMath::ToFloat(sourceRow[channel]) * channelWeights[channel];
+                }
+            }
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_Config.h b/thirdparty/cvtt/ConvectionKernels_Config.h
new file mode 100644
index 0000000000..e79d32b1da
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Config.h
@@ -0,0 +1,12 @@
+#pragma once
+#ifndef __CVTT_CONFIG_H__
+#define __CVTT_CONFIG_H__
+
+#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
+#define CVTT_USE_SSE2
+#endif
+
+// Define this to compile everything as a single source file
+//#define CVTT_SINGLE_FILE
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC.cpp b/thirdparty/cvtt/ConvectionKernels_ETC.cpp
new file mode 100644
index 0000000000..cb202a6e9c
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC.cpp
@@ -0,0 +1,3147 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ETC.h"
+#include "ConvectionKernels_ETC1.h"
+#include "ConvectionKernels_ETC2.h"
+#include "ConvectionKernels_ETC2_Rounding.h"
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_FakeBT709_Rounding.h"
+
+#include <cmath>
+
+// Index lookup tables, dimensioned [2][2][8].
+// Within each outer entry the two 8-element rows together list every index
+// 0..15 exactly once, i.e. each outer entry splits 16 indices into two groups
+// of 8: entry 0 groups {0,1,4,5,...} vs {2,3,6,7,...}, entry 1 groups 0..7 vs 8..15.
+// NOTE(review): presumably indexed as [flip][sector][pixel] over a 4x4 block -
+// confirm against ConvectionKernels_ETC.h and the callers.
+const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] =
+{
+    {
+        { 0, 1, 4, 5, 8, 9, 12, 13 },
+        { 2, 3, 6, 7, 10, 11, 14, 15 }
+    },
+    {
+        { 0, 1, 2, 3, 4, 5, 6, 7 },
+        { 8, 9, 10, 11, 12, 13, 14, 15 }
+    },
+};
+
+// Returns the unweighted sum of squared per-channel differences between two
+// 3-channel pixels. Differences are taken in signed 16-bit, then converted to
+// float before squaring.
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])
+{
+    MFloat sumOfSquares;
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[ch]);
+        MFloat deltaF = ParallelMath::ToFloat(delta);
+
+        // The first channel seeds the accumulator; later channels add to it.
+        if (ch == 0)
+            sumOfSquares = deltaF * deltaF;
+        else
+            sumOfSquares = sumOfSquares + deltaF * deltaF;
+    }
+    return sumOfSquares;
+}
+
+// Returns the squared distance between a reconstructed color and an already
+// weighted target pixel. The reconstructed channels are scaled here by
+// options.redWeight / greenWeight / blueWeight before the comparison, so
+// preWeightedPixel is expected to carry the same weighting.
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)
+{
+    MFloat redDelta = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight - preWeightedPixel[0];
+    MFloat greenDelta = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight - preWeightedPixel[1];
+    MFloat blueDelta = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight - preWeightedPixel[2];
+
+    MFloat squaredDistance = redDelta * redDelta + greenDelta * greenDelta + blueDelta * blueDelta;
+    return squaredDistance;
+}
+
+// Converts the reconstructed color with ConvertToFakeBT709 and returns its
+// squared distance to preWeightedPixel, which must already be in that space.
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])
+{
+    MFloat converted[3];
+    ConvertToFakeBT709(converted, reconstructed);
+
+    MFloat delta[3];
+    for (int ch = 0; ch < 3; ch++)
+        delta[ch] = converted[ch] - preWeightedPixel[ch];
+
+    return delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+}
+
+// Scores one candidate base color for an 8-pixel half block.
+//
+// quantizedPackedColor holds three 5-bit fields (channel ch at bit ch * 5).
+// Each field is expanded to 8 bits (5-bit replication when isDifferential is
+// set, 4-bit replication otherwise), then offset by each of the 4 modifiers
+// and clamped to [0, 255]. For every pixel, the modifier with the lowest
+// error is chosen; strictly-less comparison means the lowest selector wins ties.
+//
+// outError receives the sum of the per-pixel best errors and outSelectors
+// the chosen selectors packed 2 bits per pixel (pixel px at bit px * 2).
+// The error metric is picked from options.flags: fake BT.709, uniform, or
+// channel-weighted, in that order of precedence.
+void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)
+{
+    const bool useFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+    const bool useUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+
+    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
+    MSInt16 s16_zero = ParallelMath::MakeSInt16(0);
+
+    // Unpack the three 5-bit fields and expand each to an 8-bit channel value.
+    MUInt15 baseColor[3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MUInt15 field = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
+
+        if (isDifferential)
+            baseColor[ch] = (field << 3) | ParallelMath::RightShift(field, 2);
+        else
+            baseColor[ch] = (field << 4) | field;
+    }
+
+    // Build the four candidate colors: base + modifier, clamped to [0, 255].
+    MUInt15 candidateColors[4][3];
+    for (unsigned int s = 0; s < 4; s++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            candidateColors[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(baseColor[ch]) + modifiers[s], s16_zero)), u15_255);
+    }
+
+    MUInt16 packedSelectors = ParallelMath::MakeUInt16(0);
+    MFloat errorSum = ParallelMath::MakeFloatZero();
+
+    for (int px = 0; px < 8; px++)
+    {
+        MFloat lowestError = ParallelMath::MakeFloat(FLT_MAX);
+        MUInt16 lowestSelector = ParallelMath::MakeUInt16(0);
+
+        for (unsigned int s = 0; s < 4; s++)
+        {
+            MFloat candidateError;
+            if (useFakeBT709)
+                candidateError = ComputeErrorFakeBT709(candidateColors[s], preWeightedPixels[px]);
+            else if (useUniform)
+                candidateError = ComputeErrorUniform(pixels[px], candidateColors[s]);
+            else
+                candidateError = ComputeErrorWeighted(candidateColors[s], preWeightedPixels[px], options);
+
+            ParallelMath::FloatCompFlag isImprovement = ParallelMath::Less(candidateError, lowestError);
+            lowestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(isImprovement), ParallelMath::MakeUInt16(s), lowestSelector);
+            lowestError = ParallelMath::Min(candidateError, lowestError);
+        }
+
+        errorSum = errorSum + lowestError;
+        packedSelectors = packedSelectors | (lowestSelector << (px * 2));
+    }
+
+    outError = errorSum;
+    outSelectors = packedSelectors;
+}
+
+// Punchthrough-alpha variant of TestHalfBlock.
+//
+// quantizedPackedColor holds three 5-bit fields (channel ch at bit ch * 5),
+// each expanded to 8 bits by 5-bit replication. Only three candidate colors
+// exist per half block: base - modifier (clamped at 0), base, and
+// base + modifier (clamped at 255). For every opaque pixel the candidate
+// with the lowest error is chosen; pixels flagged in isTransparent contribute
+// zero error and are forced to selector 1.
+//
+// outError receives the summed per-pixel error and outSelectors the
+// selectors packed 2 bits per pixel (pixel px at bit px * 2). The error
+// metric is picked from options.flags: fake BT.709, uniform, or
+// channel-weighted, in that order of precedence.
+void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)
+{
+    MUInt15 quantized[3];
+    MUInt15 unquantized[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
+        unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
+    }
+
+    MUInt16 selectors = ParallelMath::MakeUInt16(0);
+    MFloat totalError = ParallelMath::MakeFloatZero();
+
+    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
+
+    MUInt15 unquantizedModified[3][3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        // Max(x, modifier) - modifier is an unsigned-safe way to compute
+        // max(x - modifier, 0).
+        unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
+        unquantizedModified[1][ch] = unquantized[ch];
+        unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
+    }
+
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    for (int px = 0; px < 8; px++)
+    {
+        ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);
+
+        MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+        MUInt15 bestSelector = ParallelMath::MakeUInt15(0);
+
+        for (unsigned int s = 0; s < 3; s++)
+        {
+            MFloat error;
+            if (isFakeBT709)
+                error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
+            else if (isUniform)
+                error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
+            else
+                error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
+
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+            bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
+            bestError = ParallelMath::Min(error, bestError);
+        }
+
+        // Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
+        // the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.
+
+        // Remap selector 1 to 2, and 2 to 3 (0 stays 0: min(3, s << 1))
+        bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);
+
+        // Transparent pixels contribute zero error and always use the transparent selector (1)
+        ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
+        ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));
+
+        totalError = totalError + bestError;
+        selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2));
+    }
+
+    outError = totalError;
+    outSelectors = selectors;
+}
+
+void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)
+{
+    // We do this part scalar because most of the cost benefit of parallelization is in error evaluation,
+    // and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks
+    // and save a lot of time.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        bool canIgnore[2] = { ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };
+        bool canIgnoreEither = canIgnore[0] || canIgnore[1];
+        float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);
+        float bestDiffErrors[2] = { FLT_MAX, FLT_MAX };
+        uint16_t bestDiffSelectors[2] = { 0, 0 };
+        uint16_t bestDiffColors[2] = { 0, 0 };
+        uint16_t bestDiffTables[2] = { 0, 0 };
+        for (int sector = 0; sector < 2; sector++)
+        {
+            unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
+            for (unsigned int i = 0; i < sectorNumAttempts; i++)
+            {
+                float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
+                if (error < bestDiffErrors[sector])
+                {
+                    bestDiffErrors[sector] = error;
+                    bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
+                    bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
+                    bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
+                }
+            }
+        }
+
+        if (canIgnore[0])
+            bestDiffColors[0] = bestDiffColors[1];
+        else if (canIgnore[1])
+            bestDiffColors[1] = bestDiffColors[0];
+
+        // The best differential possibilities must be better than the best total error
+        if (bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError)
+        {
+            // Fast path if the best possible case is legal
+            if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))
+            {
+                ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
+                ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);
+                ParallelMath::PutUInt15(bestFlip, block, flip);
+                ParallelMath::PutUInt15(bestD, block, d);
+                for (int sector = 0; sector < 2; sector++)
+                {
+                    ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
+                    ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
+                    ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
+                }
+            }
+            else
+            {
+                // Slow path: Sort the possible cases by quality, and search valid combinations
+                // TODO: Pre-flatten the error lists so this is nicer to cache
+                unsigned int numSortIndexes[2] = { 0, 0 };
+                for (int sector = 0; sector < 2; sector++)
+                {
+                    unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
+
+                    for (unsigned int i = 0; i < sectorNumAttempts; i++)
+                    {
+                        if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)
+                            drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
+                    }
+
+                    struct SortPredicate
+                    {
+                        const MFloat *diffErrors;
+                        int block;
+
+                        bool operator()(uint16_t a, uint16_t b) const
+                        {
+                            float errorA = ParallelMath::Extract(diffErrors[a], block);
+                            float errorB = ParallelMath::Extract(diffErrors[b], block);
+
+                            if (errorA < errorB)
+                                return true;
+                            if (errorA > errorB)
+                                return false;
+
+                            return a < b;
+                        }
+                    };
+
+                    SortPredicate sp;
+                    sp.diffErrors = drs.diffErrors[sector];
+                    sp.block = block;
+
+                    std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
+                }
+
+                int scannedElements = 0;
+                for (unsigned int i = 0; i < numSortIndexes[0]; i++)
+                {
+                    unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];
+                    float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);
+
+                    scannedElements++;
+
+                    if (error0 >= blockBestTotalError)
+                        break;
+
+                    float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;
+                    uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);
+
+                    if (maxError1 < bestDiffErrors[1])
+                        break;
+
+                    for (unsigned int j = 0; j < numSortIndexes[1]; j++)
+                    {
+                        unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];
+                        float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);
+
+                        scannedElements++;
+
+                        if (error1 >= maxError1)
+                            break;
+
+                        uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);
+
+                        if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
+                        {
+                            blockBestTotalError = error0 + error1;
+
+                            ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
+                            ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
+                            ParallelMath::PutUInt15(bestFlip, block, flip);
+                            ParallelMath::PutUInt15(bestD, block, d);
+                            ParallelMath::PutUInt15(bestColors[0], block, diffColor0);
+                            ParallelMath::PutUInt15(bestColors[1], block, diffColor1);
+                            ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));
+                            ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));
+                            ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));
+                            ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Vector test for one channel of a differential pair: true where the signed
+// difference b - a satisfies -5 < (b - a) < 4.
+cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
+{
+    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);
+
+    ParallelMath::Int16CompFlag aboveLowerBound = ParallelMath::Less(ParallelMath::MakeSInt16(-5), delta);
+    ParallelMath::Int16CompFlag belowUpperBound = ParallelMath::Less(delta, ParallelMath::MakeSInt16(4));
+
+    return aboveLowerBound & belowUpperBound;
+}
+
+// Vector test for a pair of packed colors (three 5-bit fields at bits 0, 5
+// and 10): true where all three fields pass ETCDifferentialIsLegalForChannel.
+// The field at bit 10 is shifted down without masking.
+cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
+{
+    MUInt15 fiveBitMask = ParallelMath::MakeUInt15(31);
+
+    ParallelMath::Int16CompFlag highFieldLegal = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10));
+    ParallelMath::Int16CompFlag midFieldLegal = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & fiveBitMask, ParallelMath::RightShift(b, 5) & fiveBitMask);
+    ParallelMath::Int16CompFlag lowFieldLegal = ETCDifferentialIsLegalForChannel(a & fiveBitMask, b & fiveBitMask);
+
+    return highFieldLegal & midFieldLegal & lowFieldLegal;
+}
+
+// Scalar test for one channel of a differential pair: true when the signed
+// difference b - a lies in [-4, 3].
+bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
+{
+    int16_t delta = static_cast<int16_t>(b) - static_cast<int16_t>(a);
+
+    if (delta < -4)
+        return false;
+    return delta <= 3;
+}
+
+// Scalar test for a pair of packed colors (three 5-bit fields at bits 0, 5
+// and 10): true only when every field passes
+// ETCDifferentialIsLegalForChannelScalar. The field at bit 10 is shifted down
+// without masking, so any bits above bit 14 would take part in the comparison.
+bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
+{
+    // Logical && short-circuits on the first failing channel; the per-channel
+    // checks have no side effects, so the result matches a bitwise combination.
+    return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))
+        && ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)
+        && ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);
+}
+
+void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
+
+    // To speed this up, we compute line total as the sum, then subtract out isolated
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
+            lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
+        }
+        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];
+
+    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;
+
+    MUInt15 isolatedAverageQuantized[3];
+    MUInt15 isolatedAverageTargets[3];
+    {
+        int divisors[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
+
+        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
+        for (int ch = 0; ch < 3; ch++)
+        {
+            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
+
+            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
+            if (!isFakeBT709)
+                numerator = numerator + addend;
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int divisor = divisors[block];
+                if (divisor == 0)
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
+                else
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
+            }
+
+            isolatedAverageTargets[ch] = numerator;
+        }
+    }
+
+    if (isFakeBT709)
+        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
+
+    MUInt15 isolatedColor[3];
+    for (int ch = 0; ch < 3; ch++)
+        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
+
+    MFloat isolatedError[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (isFakeBT709)
+            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
+        else if (isUniform)
+            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
+        else
+            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
+    }
+
+    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
+
+    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
+    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
+
+    int16_t clusterMaxLine = 0;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
+        if (blockMaxLine > clusterMaxLine)
+            clusterMaxLine = blockMaxLine;
+    }
+
+    int16_t clusterMinLine = -clusterMaxLine;
+
+    int lineDivisors[ParallelMath::ParallelSize];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
+
+    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
+
+    for (int table = 0; table < 8; table++)
+    {
+        int numUniqueColors[ParallelMath::ParallelSize];
+        MUInt15 uniqueQuantizedColors[31];
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            numUniqueColors[block] = 0;
+
+        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
+        MUInt15 modifierOffset = (modifier + modifier);
+
+        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
+        {
+            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
+            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
+
+            MUInt15 quantized[3];
+            if (isFakeBT709)
+            {
+                MUInt15 targets[3];
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                    targets[ch] = numerator;
+                }
+
+                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
+            }
+            else
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                }
+            }
+
+            MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
+                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
+                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
+            }
+        }
+
+        // Stripe unfilled unique colors
+        int maxUniqueColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (numUniqueColors[block] > maxUniqueColors)
+                maxUniqueColors = numUniqueColors[block];
+        }
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
+
+            int numUnique = numUniqueColors[block];
+            for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
+                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
+        }
+
+        for (int ci = 0; ci < maxUniqueColors; ci++)
+        {
+            MUInt15 lineColors[3][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
+                lineColors[1][ch] = unquantizedColor;
+                lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+            }
+
+            MSInt32 selectors = ParallelMath::MakeSInt32(0);
+            MFloat error = ParallelMath::MakeFloatZero();
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat pixelError = isolatedError[px];
+
+                MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);
+                for (int i = 0; i < 3; i++)
+                {
+                    MFloat error = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
+                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, pixelError);
+                    pixelError = ParallelMath::Min(error, pixelError);
+                    pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);
+                }
+
+                error = error + pixelError;
+                selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));
+            }
+
+            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+            bestError = ParallelMath::Min(error, bestError);
+
+            if (ParallelMath::AnySet(errorBetter))
+            {
+                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
+                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
+                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
+                bestIsThisMode = bestIsThisMode | errorBetter;
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (ParallelMath::Extract(bestIsThisMode, block))
+        {
+            uint32_t lowBits = 0;
+            uint32_t highBits = 0;
+
+            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
+            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
+
+            for (int ch = 0; ch < 3; ch++)
+                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
+
+            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
+            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+
+            ParallelMath::ScalarUInt16 lineColor[3];
+            for (int ch = 0; ch < 3; ch++)
+                lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;
+
+            EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    MUInt15 zero15 = ParallelMath::MakeUInt15(0);
+
+    MUInt15 counts[2] = { zero15, zero15 };
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 totals[2][3] =
+    {
+        { zero15, zero15, zero15 },
+        { zero15, zero15, zero15 }
+    };
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            totals[0][ch] = totals[0][ch] + pixels[px][ch];
+            totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
+        }
+        counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        totals[0][ch] = totals[0][ch] - totals[1][ch];
+    counts[0] = ParallelMath::MakeUInt15(16) - counts[1];
+
+    MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);
+    MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);
+    MUInt15 bestColors[2] = { zero15, zero15 };
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+
+    for (int table = 0; table < 8; table++)
+    {
+        MUInt15 numUniqueColors = zero15;
+
+        int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];
+
+        for (int sector = 0; sector < 2; sector++)
+        {
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int blockNumUniqueColors = 0;
+                uint16_t blockUniqueQuantizedColors[31];
+
+                int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);
+                int minOffsetMultiplier = -maxOffsetMultiplier;
+
+                int modifierOffset = modifier * 2;
+
+                int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
+                int blockSectorTotals[3];
+                for (int ch = 0; ch < 3; ch++)
+                    blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);
+
+                for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)
+                {
+                    // TODO: This isn't ideal for FakeBT709
+                    int16_t quantized[3];
+                    for (int ch = 0; ch < 3; ch++)
+                    {
+                        if (blockSectorCounts == 0)
+                            quantized[ch] = 0;
+                        else
+                            quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));
+                    }
+
+                    uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
+                    if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])
+                    {
+                        assert(blockNumUniqueColors < 32);
+                        blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
+                    }
+                }
+
+                ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);
+
+                int baseIndex = 0;
+                if (sector == 1)
+                    baseIndex = ParallelMath::Extract(he.numUniqueColors[0], block);
+
+                for (int i = 0; i < blockNumUniqueColors; i++)
+                    ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
+            }
+        }
+
+        MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];
+        int maxErrorColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            int lastColor = ParallelMath::Extract(totalColors, block);
+            uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);
+            for (int i = lastColor; i < maxErrorColors; i++)
+                ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
+        }
+
+        for (int ci = 0; ci < maxErrorColors; ci++)
+        {
+            MUInt15 fifteen = ParallelMath::MakeUInt15(15);
+            MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);
+            MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);
+
+            MUInt15 colors[2][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
+                colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
+            }
+
+            MUInt16 signBits = ParallelMath::MakeUInt16(0);
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat errors[2];
+                for (int i = 0; i < 2; i++)
+                {
+                    if (isFakeBT709)
+                        errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
+                    else if (isUniform)
+                        errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
+                    else
+                        errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
+                }
+
+                ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));
+                he.errors[ci][px] = ParallelMath::Min(errors[0], errors[1]);
+                signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));
+            }
+            he.signBits[ci] = signBits;
+        }
+
+        int maxUniqueColorCombos = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);
+            if (numUniqueColorCombos > maxUniqueColorCombos)
+                maxUniqueColorCombos = numUniqueColorCombos;
+        }
+
+        MUInt15 indexes[2] = { zero15, zero15 };
+        MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };
+
+        int block1Starts[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);
+
+        for (int combo = 0; combo < maxUniqueColorCombos; combo++)
+        {
+            MUInt15 index0 = indexes[0] + ParallelMath::MakeUInt15(1);
+            ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], index0);
+            ParallelMath::ConditionalSet(index0, index0Overflow, ParallelMath::MakeUInt15(0));
+
+            MUInt15 index1 = ParallelMath::Min(maxIndex[1], indexes[1] + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));
+            indexes[0] = index0;
+            indexes[1] = index1;
+
+            int ci0[ParallelMath::ParallelSize];
+            int ci1[ParallelMath::ParallelSize];
+            MUInt15 color0;
+            MUInt15 color1;
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                ci0[block] = ParallelMath::Extract(index0, block);
+                ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
+                ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
+                ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
+            }
+
+            MFloat totalError = ParallelMath::MakeFloatZero();
+            MUInt16 sectorBits = ParallelMath::MakeUInt16(0);
+            MUInt16 signBits = ParallelMath::MakeUInt16(0);
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat errorCI0;
+                MFloat errorCI1;
+                MUInt16 signBits0;
+                MUInt16 signBits1;
+
+                for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                {
+                    ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
+                    ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
+                    ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
+                    ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
+                }
+
+                totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);
+
+                MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);
+
+                ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));
+
+                sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);
+                signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
+            }
+
+            ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);
+            ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);
+            if (ParallelMath::AnySet(totalErrorBetter16))
+            {
+                bestIsThisMode = bestIsThisMode | totalErrorBetter16;
+                ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
+                ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);
+                ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);
+                ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
+                ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
+                bestError = ParallelMath::Min(totalError, bestError);
+            }
+        }
+    }
+
+    if (ParallelMath::AnySet(bestIsThisMode))
+    {
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (!ParallelMath::Extract(bestIsThisMode, block))
+                continue;
+
+            ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };
+            ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
+            ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
+            ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);
+
+            EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
+{
+    // We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
+    //
+    // T mode: C1, C2+M, Transparent, C2-M
+    // H mode: C1+M, C1-M, Transparent, C2-M
+    //
+    // So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
+    // The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
+    //
+    // Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
+    // but unlike opaque blocks, we can't flip them.
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    ParallelMath::FloatCompFlag isTransparentF[16];
+    for (int px = 0; px < 16; px++)
+        isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+    ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
+    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);
+
+    ParallelMath::Int16CompFlag isIsolated[16];
+    ParallelMath::Int16CompFlag isLine[16];
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
+        isIsolated[px] = isIsolatedBase[px] & isOpaque;
+        isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
+    }
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
+            lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
+        }
+        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
+        numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));
+    }
+
+    MUInt15 isolatedAverageQuantized[3];
+    MUInt15 hModeIsolatedQuantized[8][3];
+    MUInt15 isolatedAverageTargets[3];
+    {
+        int divisors[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
+
+        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
+        for (int ch = 0; ch < 3; ch++)
+        {
+            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
+
+            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
+            if (!isFakeBT709)
+                numerator = numerator + addend;
+
+            MUInt15 hModeIsolatedNumerators[8];
+            for (int table = 0; table < 8; table++)
+            {
+                // FIXME: Handle fake BT.709 correctly
+                MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));
+
+                hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
+            }
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int divisor = divisors[block];
+                if (divisor == 0)
+                {
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
+                    for (int table = 0; table < 8; table++)
+                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);
+                }
+                else
+                {
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
+                    for (int table = 0; table < 8; table++)
+                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
+                }
+            }
+
+            isolatedAverageTargets[ch] = numerator;
+        }
+    }
+
+    if (isFakeBT709)
+        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
+
+    for (int table = 0; table < 8; table++)
+        for (int ch = 0; ch < 3; ch++)
+            hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);
+
+    MUInt15 isolatedColor[3];
+    for (int ch = 0; ch < 3; ch++)
+        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
+
+    MFloat isolatedError[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (isFakeBT709)
+            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
+        else if (isUniform)
+            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
+        else
+            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
+
+        ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+    }
+
+    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
+    MUInt15 bestIsolatedColor = ParallelMath::MakeUInt15(0);
+    MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);
+    ParallelMath::Int16CompFlag bestUseHMode = ParallelMath::MakeBoolInt16(false);
+
+    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
+    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
+
+    int16_t clusterMaxLine = 0;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
+        if (blockMaxLine > clusterMaxLine)
+            clusterMaxLine = blockMaxLine;
+    }
+
+    int16_t clusterMinLine = -clusterMaxLine;
+
+    int lineDivisors[ParallelMath::ParallelSize];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
+
+    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
+
+    for (int table = 0; table < 8; table++)
+    {
+        int numUniqueColors[ParallelMath::ParallelSize];
+        MUInt15 uniqueQuantizedColors[31];
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            numUniqueColors[block] = 0;
+
+        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
+        MUInt15 modifierOffset = (modifier + modifier);
+
+        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)
+        {
+            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
+            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
+
+            MUInt15 quantized[3];
+            if (isFakeBT709)
+            {
+                MUInt15 targets[3];
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                    targets[ch] = numerator;
+                }
+
+                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
+            }
+            else
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                }
+            }
+
+            MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
+                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
+                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
+            }
+        }
+
+        // Stripe unfilled unique colors
+        int maxUniqueColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (numUniqueColors[block] > maxUniqueColors)
+                maxUniqueColors = numUniqueColors[block];
+        }
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
+
+            int numUnique = numUniqueColors[block];
+            for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
+                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
+        }
+
+        MFloat hModeErrors[16];
+        MUInt15 hModeUnquantizedColor[3];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];
+
+            MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;
+            hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+        }
+
+        for (int px = 0; px < 16; px++)
+        {
+            hModeErrors[px] = isUniform ? ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);
+            ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+        }
+
+        MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];
+        ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);
+
+        for (int ci = 0; ci < maxUniqueColors; ci++)
+        {
+            MUInt15 lineColors[2][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
+                lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+            }
+
+            MUInt15 bestLineSelector[16];
+            MFloat bestLineError[16];
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat lineErrors[2];
+                for (int i = 0; i < 2; i++)
+                    lineErrors[i] = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
+
+                ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));
+                bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));
+                bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);
+
+                ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+            }
+
+            // One case considered here was if it was possible to force H mode to be valid when the line color is unused.
+            // That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
+            // which is always checked after a swap.
+            MFloat tModeError = ParallelMath::MakeFloatZero();
+            MFloat hModeError = ParallelMath::MakeFloatZero();
+            for (int px = 0; px < 16; px++)
+            {
+                tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
+                hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
+            }
+
+            ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);
+
+            MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];
+
+            ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);
+
+            ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
+            ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;
+
+            MFloat roundBestError = tModeError;
+            ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);
+
+            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
+            ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);
+
+            if (ParallelMath::AnySet(errorBetter))
+            {
+                MSInt32 selectors = ParallelMath::MakeSInt32(0);
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 selector = bestLineSelector[px];
+
+                    MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
+                    ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));
+
+                    ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));
+                    ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));
+                    selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));
+                }
+
+                bestError = ParallelMath::Min(bestError, roundBestError);
+                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
+                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
+                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
+                ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
+                ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);
+                
+                bestIsThisMode = bestIsThisMode | errorBetter;
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (ParallelMath::Extract(bestIsThisMode, block))
+        {
+            uint32_t lowBits = 0;
+            uint32_t highBits = 0;
+
+            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
+            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
+
+            for (int ch = 0; ch < 3; ch++)
+                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
+
+            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
+            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+
+            ParallelMath::ScalarUInt16 lineColor[3];
+            for (int ch = 0; ch < 3; ch++)
+                lineColor[ch] = (blockBestLineColor >> (10 - (ch * 5))) & 15;
+
+            if (ParallelMath::Extract(bestIsHMode, block))
+            {
+                // T mode: C1, C2+M, Transparent, C2-M
+                // H mode: C1+M, C1-M, Transparent, C2-M
+                static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };
+                static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };
+
+                // Remap selectors
+                ParallelMath::ScalarUInt16 signBits = 0;
+                ParallelMath::ScalarUInt16 sectorBits = 0;
+                int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+                for (int px = 0; px < 16; px++)
+                {
+                    int32_t selector = (blockBestSelectors >> (px * 2)) & 3;
+                    sectorBits |= (selectorRemapSector[selector] << px);
+                    signBits |= (selectorRemapSign[selector] << px);
+                }
+
+                ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };
+
+                EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);
+            }
+            else
+                EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
+        }
+    }
+}
+
+
+cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
+{
+    // Widen a packed planar coefficient to 8 bits by replicating its top bits: 7-bit input for ch 1, 6-bit otherwise.
+    return (ch == 1)
+        ? ((coeff << 1) | (ParallelMath::RightShift(coeff, 6)))
+        : ((coeff << 2) | (ParallelMath::RightShift(coeff, 4)));
+}
+
+void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    // Planar-mode candidate: for each channel, least-squares fit C ~= O + x*H + y*V over the 4x4 block,
+    // quantize the three coefficients (7 bits for green, 6 bits for red/blue) and score the decoded block.
+    // Lanes whose total error beats bestError get their 8 output bytes rewritten and bestError lowered.
+    // NOTE: If it's desired to do this in another color space, the best way to do it would probably be
+    // to do everything in that color space and then transform it back to RGB.
+    // The solve uses H = (H-O)/4 and V = (V-O)/4 to simplify the math: error = (x*H + y*V + O - C)^2
+    MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+    MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+    MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+
+    MFloat totalError = ParallelMath::MakeFloatZero();
+    MUInt15 bestCoeffs[3][3];	// [Channel][Coeff], coefficients ordered O, H, V
+    for (int ch = 0; ch < 3; ch++)
+    {
+        float fhh = 0.f;
+        float fho = 0.f;
+        float fhv = 0.f;
+        float foo = 0.f;
+        float fov = 0.f;
+        float fvv = 0.f;
+        MFloat fc = ParallelMath::MakeFloatZero();
+        MFloat fh = ParallelMath::MakeFloatZero();
+        MFloat fv = ParallelMath::MakeFloatZero();
+        MFloat fo = ParallelMath::MakeFloatZero();
+        // Each cross term (h*o, h*v, o*v) shares one accumulator through these aliases and is summed twice below
+        float &foh = fho;
+        float &fvh = fhv;
+        float &fvo = fov;
+
+        for (int px = 0; px < 16; px++)
+        {
+            float x = static_cast<float>(px % 4);
+            float y = static_cast<float>(px / 4);
+            MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);
+
+            // Accumulate the expansion of (x*H + y*V + O - C)^2
+            fhh += x * x;
+            fhv += x * y;
+            fho += x;
+            fh = fh - c * x;
+
+            fvh += y * x;
+            fvv += y * y;
+            fvo += y;
+            fv = fv - c * y;
+
+            foh += x;
+            fov += y;
+            foo += 1;
+            fo = fo - c;
+            // The linear terms carry a factor of 2 in the expansion (-2*C*x, -2*C*y, -2*C), so add them again
+            fh = fh - c * x;
+            fv = fv - c * y;
+            fo = fo - c;
+            fc = fc + c * c;
+        }
+
+        //float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;
+
+        // error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
+        // derror/dh = 2*fhh*h + fho*o + fhv*v + fh
+        // derror/dv = fhv*h + fov*o + 2*fvv*v + fv
+        // derror/do = fho*h + 2*foo*o + fov*v + fo
+
+        // Solve system of equations
+        // h o v 1 = 0
+        // -------
+        // d e f g  R0
+        // i j k l  R1
+        // m n p q  R2
+
+        float d = 2.0f * fhh;
+        float e = fho;
+        float f = fhv;
+        MFloat gD = fh;
+
+        float i = fhv;
+        float j = fov;
+        float k = 2.0f * fvv;
+        MFloat lD = fv;
+
+        float m = fho;
+        float n = 2.0f * foo;
+        float p = fov;
+        MFloat qD = fo;
+
+        {
+            // Factor out first column from R1 and R2
+            float r0to1 = -i / d;
+            float r0to2 = -m / d;
+
+            // 0 j1 k1 l1D
+            float j1 = j + r0to1 * e;
+            float k1 = k + r0to1 * f;
+            MFloat l1D = lD + gD * r0to1;
+
+            // 0 n1 p1 q1D
+            float n1 = n + r0to2 * e;
+            float p1 = p + r0to2 * f;
+            MFloat q1D = qD + gD * r0to2;
+
+            // Factor out third column from R2
+            float r1to2 = -p1 / k1;
+
+            // 0 n2 0 q2D  =>  n2*o + q2 = 0  =>  o = -q2 / n2
+            float n2 = n1 + r1to2 * j1;
+            MFloat q2D = q1D + l1D * r1to2;
+
+            o[ch] = -q2D / n2;
+
+            // Factor out second column from R1
+            // 0 n2 0 q2D
+
+            float r2to1 = -j1 / n2;
+
+            // 0 0 k1 l2D
+            // 0 n2 0 q2D
+            MFloat l2D = l1D + q2D * r2to1;
+
+            float elim2 = -f / k1;
+            float elim1 = -e / n2;
+
+            // d 0 0 g2D
+            MFloat g2D = gD + l2D * elim2 + q2D * elim1;
+
+            // d*h + g2 = 0   =>  h = -g2 / d
+            // k1*v + l2 = 0  =>  v = -l2 / k1
+            h[ch] = -g2D / d;
+            v[ch] = -l2D / k1;
+        }
+
+        // Undo the local transformation: the solved h/v are (H-O)/4 and (V-O)/4
+        h[ch] = h[ch] * 4.0f + o[ch];
+        v[ch] = v[ch] * 4.0f + o[ch];
+    }
+
+    if (isFakeBT709)
+    {
+        MFloat oRGB[3];
+        MFloat hRGB[3];
+        MFloat vRGB[3];
+
+        ConvertFromFakeBT709(oRGB, o);
+        ConvertFromFakeBT709(hRGB, h);
+        ConvertFromFakeBT709(vRGB, v);
+
+        // Twiddling in fake BT.709 is a mess, just round off for now (the precision is pretty good anyway)
+        {
+            ParallelMath::RoundTowardNearestForScope rtn;
+
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };
+
+                for (int c = 0; c < 3; c++)
+                {
+                    MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
+                    if (ch == 1)
+                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
+                    else
+                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
+                    fcoeffs[c] = coeff;
+                }
+
+                for (int c = 0; c < 3; c++)
+                    bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
+            }
+        }
+
+        MUInt15 reconstructed[16][3];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);
+            MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);
+            MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);
+
+            MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+            MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+            // Decoded value = clamp((x*(H-O) + y*(V-O) + 4*O + 2) >> 2, 0, 255); addend is the constant
+            // 4*O + 2 part, where the +2 rounds the divide-by-4.
+            MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
+
+            for (int px = 0; px < 16; px++)
+            {
+                MUInt15 pxv = ParallelMath::MakeUInt15(px);
+                MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
+                MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
+
+                MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
+                MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
+                reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
+            }
+        }
+
+        totalError = ParallelMath::MakeFloatZero();
+        for (int px = 0; px < 16; px++)
+            totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
+    }
+    else
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };
+            MUInt15 coeffRanges[3][2];
+
+            for (int c = 0; c < 3; c++)
+            {
+                MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
+                if (ch == 1)
+                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
+                else
+                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
+                fcoeffs[c] = coeff;
+            }
+            // Bracket every coefficient with its rounded-down [0] and rounded-up [1] quantization
+            {
+                ParallelMath::RoundDownForScope rd;
+                for (int c = 0; c < 3; c++)
+                    coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
+            }
+
+            {
+                ParallelMath::RoundUpForScope ru;
+                for (int c = 0; c < 3; c++)
+                    coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
+            }
+            // Try all 8 down/up combinations and keep, per lane, the one with the lowest squared error
+            MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
+            for (int io = 0; io < 2; io++)
+            {
+                MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);
+
+                for (int ih = 0; ih < 2; ih++)
+                {
+                    MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);
+                    MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+                    for (int iv = 0; iv < 2; iv++)
+                    {
+                        MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);
+                        MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+                        MFloat error = ParallelMath::MakeFloatZero();
+
+                        MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2;
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 pxv = ParallelMath::MakeUInt15(px);
+                            MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
+                            MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
+
+                            MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
+                            MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
+                            MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
+
+                            MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
+
+                            MFloat deltaF = ParallelMath::ToFloat(delta);
+                            error = error + deltaF * deltaF;
+                        }
+
+                        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
+                        if (ParallelMath::AnySet(errorBetter))
+                        {
+                            bestChannelError = ParallelMath::Min(error, bestChannelError);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);
+                        }
+                    }
+                }
+            }
+            // Outside uniform mode, scale the channel's squared error by the square of its weight
+            if (!isUniform)
+            {
+                switch (ch)
+                {
+                case 0:
+                    bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
+                    break;
+                case 1:
+                    bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
+                    break;
+                case 2:
+                    bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
+                    break;
+                default:
+                    break;
+                }
+            }
+
+            totalError = totalError + bestChannelError;
+        }
+    }
+
+    ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
+    if (ParallelMath::AnySet(errorBetter))
+    {
+        bestError = ParallelMath::Min(bestError, totalError);
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (!ParallelMath::Extract(errorBetter, block))
+                continue;
+            // Unsigned on purpose: the packing below shifts values up into bit 31 (e.g. gh << 25, 1u << 31)
+            const uint32_t ro = ParallelMath::Extract(bestCoeffs[0][0], block);
+            const uint32_t rh = ParallelMath::Extract(bestCoeffs[0][1], block);
+            const uint32_t rv = ParallelMath::Extract(bestCoeffs[0][2], block);
+
+            const uint32_t go = ParallelMath::Extract(bestCoeffs[1][0], block);
+            const uint32_t gh = ParallelMath::Extract(bestCoeffs[1][1], block);
+            const uint32_t gv = ParallelMath::Extract(bestCoeffs[1][2], block);
+
+            const uint32_t bo = ParallelMath::Extract(bestCoeffs[2][0], block);
+            const uint32_t bh = ParallelMath::Extract(bestCoeffs[2][1], block);
+            const uint32_t bv = ParallelMath::Extract(bestCoeffs[2][2], block);
+
+            const uint32_t go1 = go >> 6;
+            const uint32_t go2 = go & 63;
+
+            const uint32_t bo1 = bo >> 5;
+            const uint32_t bo2 = (bo >> 3) & 3;
+            const uint32_t bo3 = bo & 7;
+
+            const uint32_t rh1 = (rh >> 1);
+            const uint32_t rh2 = rh & 1;
+
+            const uint32_t fakeR = ro >> 2;
+            const uint32_t fakeDR = go1 | ((ro & 3) << 1);
+
+            const uint32_t fakeG = (go2 >> 2);
+            const uint32_t fakeDG = ((go2 & 3) << 1) | bo1;
+
+            const uint32_t fakeB = bo2;
+            const uint32_t fakeDB = bo3 >> 1;
+
+            uint32_t highBits = 0;
+            uint32_t lowBits = 0;
+
+            // Avoid overflowing R: a negative 3-bit delta (bit 2 set) that would drop the base below 0 needs spare bit 63 set
+            if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)
+                highBits |= 1u << (63 - 32);
+
+            // Avoid overflowing G: same rule, using spare bit 55
+            if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)
+                highBits |= 1u << (55 - 32);
+
+            // Overflow B: unlike R and G, the B base+delta sum is deliberately pushed out of range
+            if (fakeB + fakeDB < 4)
+            {
+                // Overflow low: set the delta's sign bit (bit 42) so the sum goes negative
+                highBits |= 1u << (42 - 32);
+            }
+            else
+            {
+                // Overflow high: set the three spare bits above the base (bits 45-47) so the sum exceeds the range
+                highBits |= 7u << (45 - 32);
+            }
+
+            highBits |= ro << (57 - 32);
+            highBits |= go1 << (56 - 32);
+            highBits |= go2 << (49 - 32);
+            highBits |= bo1 << (48 - 32);
+            highBits |= bo2 << (43 - 32);
+            highBits |= bo3 << (39 - 32);
+            highBits |= rh1 << (34 - 32);
+            highBits |= 1u << (33 - 32);
+            highBits |= rh2 << (32 - 32);
+
+            lowBits |= gh << 25;
+            lowBits |= bh << 19;
+            lowBits |= rv << 13;
+            lowBits |= gv << 6;
+            lowBits |= bv << 0;
+
+            for (int i = 0; i < 4; i++)
+                outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;
+            for (int i = 0; i < 4; i++)
+                outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)
+{
+    // Compresses ParallelSize 16-pixel blocks by running a series of candidate encoders over the same
+    // outputBuffer/bestError pair (EncodePlanar, for instance, only rewrites lanes whose error it lowers).
+    // The order of the calls and of the sector-assignment flips between them is therefore significant.
+    ParallelMath::Int16CompFlag pixelIsTransparent[16];
+    ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
+    ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
+
+    if (punchthroughAlpha)
+    {
+        const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f;
+
+        // +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
+        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));
+
+        for (int px = 0; px < 16; px++)
+        {
+            MUInt15 alpha;
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]);
+
+            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
+            anyTransparent = (anyTransparent | isTransparent);
+            allTransparent = (allTransparent & isTransparent);
+            pixelIsTransparent[px] = isTransparent;
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+            pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
+        allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData);
+
+    MUInt15 pixels[16][3];
+    MFloat preWeightedPixels[16][3];
+    ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
+    // Force the color of transparent pixels to zero, in both the integer and the pre-weighted copies
+    if (ParallelMath::AnySet(anyTransparent))
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
+            ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
+
+            for (int ch = 0; ch < 3; ch++)
+            {
+                ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));
+                ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));
+            }
+        }
+    }
+
+    if (!ParallelMath::AllSet(allTransparent))
+        EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
+
+    MFloat chromaDelta[16][2];
+    // Per-lane count of opaque pixels (16 minus the transparent ones)
+    MUInt15 numOpaque = ParallelMath::MakeUInt15(16);
+    for (int px = 0; px < 16; px++)
+        numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));
+    // chromaDelta = pixel chroma coordinate * pixel count - sum of all coordinates (a scaled offset from the centroid)
+    if (options.flags & cvtt::Flags::Uniform)
+    {
+        MSInt16 chromaCoordinates3[16][2];
+        for (int px = 0; px < 16; px++)
+        {
+            chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
+            chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
+        }
+
+        MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) };
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 2; ch++)
+                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
+        }
+
+        if (punchthroughAlpha)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                {
+                    MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));
+                    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
+                    chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
+                }
+            }
+        }
+        else
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                    chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]);
+            }
+        }
+
+        const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f);
+
+        for (int px = 0; px < 16; px++)
+            chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;
+    }
+    else
+    {
+        const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };
+        const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };
+
+        MFloat chromaCoordinates3[16][2];
+        for (int px = 0; px < 16; px++)
+        {
+            const MFloat &px0 = preWeightedPixels[px][0];
+            const MFloat &px1 = preWeightedPixels[px][1];
+            const MFloat &px2 = preWeightedPixels[px][2];
+
+            chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];
+            chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];
+        }
+
+        MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 2; ch++)
+                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
+        }
+
+        if (punchthroughAlpha)
+        {
+            const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                {
+                    MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
+                    MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
+                    chromaDelta[px][ch] = delta;
+                }
+            }
+        }
+        else
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                    chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];
+            }
+        }
+    }
+
+    MFloat covXX = ParallelMath::MakeFloatZero();
+    MFloat covYY = ParallelMath::MakeFloatZero();
+    MFloat covXY = ParallelMath::MakeFloatZero();
+
+    for (int px = 0; px < 16; px++)
+    {
+        MFloat nx = chromaDelta[px][0];
+        MFloat ny = chromaDelta[px][1];
+
+        covXX = covXX + nx * nx;
+        covYY = covYY + ny * ny;
+        covXY = covXY + nx * ny;
+    }
+    // ev is the larger eigenvalue of the 2x2 matrix [covXX covXY; covXY covYY]; (dx, dy) is the split direction built from it
+    MFloat halfTrace = (covXX + covYY) * 0.5f;
+    MFloat det = covXX * covYY - covXY * covXY;
+
+    MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det));
+
+    MFloat ev = halfTrace + mm;
+
+    MFloat dx = (covYY - ev + covXY);
+    MFloat dy = -(covXX - ev + covXY);
+
+    // If evenly distributed, pick an arbitrary plane
+    ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
+    ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));
+
+    ParallelMath::Int16CompFlag sectorAssignments[16];
+    for (int px = 0; px < 16; px++)
+        sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));
+
+    if (!ParallelMath::AllSet(allTransparent))
+    {
+        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
+
+        // Flip sector assignments
+        for (int px = 0; px < 16; px++)
+            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+
+        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
+
+        EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);
+
+        CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
+    }
+
+    if (ParallelMath::AnySet(anyTransparent))
+    {
+        if (!ParallelMath::AllSet(allTransparent))
+        {
+            // Flip sector assignments back (undoes the single flip made in the branch above, which ran under the same condition)
+            for (int px = 0; px < 16; px++)
+                sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+        }
+
+        // Reset the error of any transparent blocks to max and retry with punchthrough modes
+        ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
+
+        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
+
+        // Flip sector assignments
+        for (int px = 0; px < 16; px++)
+            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+
+        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
+
+        CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, internalData->m_drs, options);
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)
+{
+    // Gather component 3 (alpha) of every block into per-pixel lanes, then encode with is11Bit and isSigned both false.
+    MUInt15 alphaLanes[16];
+    for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+    {
+        for (int pixelIndex = 0; pixelIndex < 16; pixelIndex++)
+            ParallelMath::PutUInt15(alphaLanes[pixelIndex], lane, pixelBlocks[lane].m_pixels[pixelIndex][3]);
+    }
+
+    CompressETC2AlphaBlockInternal(outputBuffer, alphaLanes, false, false, options);
+}
+
+// Encodes one 8-byte ETC2-alpha / EAC block per parallel lane by exhaustive search.
+//
+// outputBuffer - receives 8 bytes per lane; lane N is written at outputBuffer + N * 8.
+// pixels       - 16 values per lane. The search assumes 0..255 when is11Bit is false and
+//                0..2047 when it is true (see the initial minAlpha and the clamps below).
+// is11Bit      - selects the 11-bit (EAC) variant: multipliers are tracked pre-scaled by 8
+//                and the base value is tracked in 11-bit units until the final packing step.
+// isSigned     - only meaningful with is11Bit; adjusts rounding, forbids the lowest base
+//                codeword and flips the codeword's top bit on output.
+// options      - not read by this function.
+//
+// For each of the 16 modifier tables, 10 candidate offset ranges and 2 neighbouring
+// multipliers are tried; the candidate with the lowest summed squared error wins per lane.
+void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)
+{
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 2047 : 255);
+    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
+
+    for (int px = 0; px < 16; px++)
+    {
+        minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
+        maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
+    }
+
+    MUInt15 alphaSpan = maxAlpha - minAlpha;
+    MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
+
+    MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);
+    MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);
+    MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);
+    MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);
+    MUInt15 bestIndexes[16];
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    // r = 0..9 yields mainRange 0..3 and subrange 0..2; mainRange 3 only occurs with
+    // subrange 0, so the table index "3 - mainRange - bit" below never goes negative.
+    const int numAlphaRanges = 10;
+    for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)
+    {
+        for (int r = 0; r < numAlphaRanges; r++)
+        {
+            int subrange = r % 3;
+            int mainRange = r / 3;
+
+            // The negative side of a table row is -(positive entry) - 1.
+            int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];
+            int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;
+            uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
+
+            MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
+            MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
+
+            // Smallest multiplier estimate: the lane's alpha span divided by the span of the
+            // chosen offsets. NOTE(review): relies on offsetSpan being non-zero, i.e. on the
+            // positive table entries being non-negative - confirm against the table.
+            MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
+
+                uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
+                ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
+            }
+
+            if (is11Bit)
+            {
+                // Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
+                minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);
+            }
+            else
+            {
+                // We cap at 1 and 14 so both multipliers are valid and dividable
+                // Cases where offset span is 0 should be caught by multiplier 1 of table 13
+                minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));
+            }
+
+            // Try the estimated multiplier and the next one up.
+            for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)
+            {
+                MUInt15 multiplier = minMultiplier;
+
+                if (is11Bit)
+                {
+                    // 11-bit multipliers are kept pre-scaled by 8; a scaled value of 0 is
+                    // replaced by 1, which shifts back down to an encoded multiplier of 0.
+                    if (multiplierOffset == 1)
+                        multiplier = multiplier + ParallelMath::MakeUInt15(8);
+                    else
+                        multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));
+                }
+                else
+                {
+                    if (multiplierOffset == 1)
+                        multiplier = multiplier + ParallelMath::MakeUInt15(1);
+                }
+
+                MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
+                MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
+
+                // codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
+                MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
+
+                MUInt15 baseAlpha;
+                if (is11Bit)
+                {
+                    // In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
+                    if (isSigned)
+                        unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);
+
+                    // -128 is illegal for some reason
+                    MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);
+
+                    // baseAlpha stays in 11-bit units here: a multiple of 8 (plus 4 when unsigned).
+                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));
+                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);
+
+                    if (!isSigned)
+                        baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);
+                }
+                else
+                {
+                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));
+                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);
+                }
+
+                // Quantize every pixel against this candidate and accumulate squared error.
+                MUInt15 indexes[16];
+                MUInt31 totalError = ParallelMath::MakeUInt31(0);
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 quantizedValues;
+                    QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
+
+                    if (is11Bit)
+                    {
+                        MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
+                        MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
+                        totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
+                    }
+                    else
+                        totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
+                }
+
+                ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
+                if (ParallelMath::AnySet(isBetter))
+                {
+                    ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
+                    ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
+                    ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
+                    ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
+
+                    for (int px = 0; px < 16; px++)
+                        ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
+                }
+
+                // TODO: Do one refine pass
+            }
+        }
+    }
+
+    if (is11Bit)
+    {
+        // The search ran in 11-bit units: the multiplier was scaled by 8 and the base value
+        // is 8 * codeword (+4 when unsigned). Both must be shifted back to their encoded
+        // widths before packing; otherwise the byte cast below would truncate the base
+        // value and the signed XOR would flip the wrong bit.
+        bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);
+        bestBaseCodeword = ParallelMath::RightShift(bestBaseCodeword, 3);
+
+        if (isSigned)
+            bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        uint8_t *output = outputBuffer + block * 8;
+
+        // Byte 0: base codeword. Byte 1: multiplier in the high nibble, table in the low nibble.
+        output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
+
+        ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
+        ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
+
+        output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);
+
+        // Selectors are stored transposed relative to the row-major pixel array.
+        static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+        ParallelMath::ScalarUInt16 indexes[16];
+        for (int px = 0; px < 16; px++)
+            indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
+
+        // Bytes 2..7: sixteen 3-bit selectors packed most-significant-bit first.
+        int outputOffset = 2;
+        int outputBits = 0;
+        int numOutputBits = 0;
+        for (int s = 0; s < 16; s++)
+        {
+            outputBits = (outputBits << 3) | indexes[s];
+            numOutputBits += 3;
+
+            if (numOutputBits >= 8)
+            {
+                output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));
+                numOutputBits -= 8;
+
+                outputBits &= ((1 << numOutputBits) - 1);
+            }
+        }
+
+        assert(outputOffset == 8 && numOutputBits == 0);
+    }
+}
+
+// Compresses one 11-bit EAC block per lane from signed 16-bit scalar input.
+//
+// Each input value is clamped and, for signed data, biased by +1024 so it fits in an
+// unsigned working range (1..2047 signed, 0..2047 unsigned) before being handed to the
+// shared alpha/EAC encoder in 11-bit mode.
+void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)
+{
+    // Clamp limits and bias for the selected variant. The unsigned variant uses a zero
+    // bias, which leaves the clamped value unchanged.
+    const MSInt16 upperLimit = ParallelMath::MakeSInt16(isSigned ? 1023 : 2047);
+    const MSInt16 bias = ParallelMath::MakeSInt16(isSigned ? 1024 : 0);
+    const MSInt16 lowerLimit = ParallelMath::MakeSInt16(isSigned ? 1 : 0);
+
+    MUInt15 shiftedPixels[16];
+    for (int px = 0; px < 16; px++)
+    {
+        // Gather this pixel from every lane's block.
+        MSInt16 gathered;
+        for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+            ParallelMath::PutSInt16(gathered, lane, inputBlocks[lane].m_pixels[px]);
+
+        // Cap from above, apply the bias, then cap from below; the result is never negative.
+        const MSInt16 capped = ParallelMath::Min(gathered, upperLimit) + bias;
+        const MSInt16 clamped = ParallelMath::Max(lowerLimit, capped);
+
+        shiftedPixels[px] = ParallelMath::LosslessCast<MUInt15>::Cast(clamped);
+    }
+
+    CompressETC2AlphaBlockInternal(outputBuffer, shiftedPixels, true, isSigned, options);
+}
+
+// Public ETC1 entry point: unpacks the input blocks into per-lane pixel arrays and runs
+// the ETC1 search without punchthrough, starting from the worst possible error.
+// compressionData must be the ETC1CompressionDataInternal object that owns the
+// differential-resolve scratch storage.
+void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)
+{
+    MUInt15 rgbPixels[16][3];
+    MFloat weightedPixels[16][3];
+    ExtractBlocks(rgbPixels, weightedPixels, inputBlocks, options);
+
+    ETC1CompressionDataInternal *internalData = static_cast<ETC1CompressionDataInternal*>(compressionData);
+    DifferentialResolveStorage &scratch = internalData->m_drs;
+
+    MFloat runningBestError = ParallelMath::MakeFloat(FLT_MAX);
+    CompressETC1BlockInternal(runningBestError, outputBuffer, rgbPixels, weightedPixels, scratch, options, false);
+}
+
+// Transposes the RGB channels of the input blocks into lane-parallel form and produces
+// the matching error-metric inputs.
+//
+// pixels            - out: integer RGB per pixel.
+// preWeightedPixels - out: the fake-BT.709 transform of the pixel when that flag is set,
+//                     the plain float value when the Uniform flag is set, otherwise each
+//                     channel multiplied by its red/green/blue weight from options.
+void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)
+{
+    const bool useFakeBT709 = (options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0;
+    const bool useUniformWeights = (options.flags & cvtt::Flags::Uniform) != 0;
+
+    for (int px = 0; px < 16; px++)
+    {
+        MUInt15 *rgb = pixels[px];
+        MFloat *weighted = preWeightedPixels[px];
+
+        // Gather the three colour channels of this pixel from every lane.
+        for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                ParallelMath::PutUInt15(rgb[ch], lane, inputBlocks[lane].m_pixels[px][ch]);
+        }
+
+        // The fake-BT.709 flag takes precedence over the Uniform flag.
+        if (useFakeBT709)
+        {
+            ConvertToFakeBT709(weighted, rgb);
+            continue;
+        }
+
+        if (useUniformWeights)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                weighted[ch] = ParallelMath::ToFloat(rgb[ch]);
+            continue;
+        }
+
+        weighted[0] = ParallelMath::ToFloat(rgb[0]) * options.redWeight;
+        weighted[1] = ParallelMath::ToFloat(rgb[1]) * options.greenWeight;
+        weighted[2] = ParallelMath::ToFloat(rgb[2]) * options.blueWeight;
+    }
+}
+
+// Quantizes a half-block's summed colour to 5 bits (differential) or 4 bits (individual)
+// per channel, choosing the rounding direction of each channel by minimizing squared
+// distance in the fake-BT.709 space.
+//
+// quantized        - out: quantized channel values.
+// sectorCumulative - per-channel sum over the half-block's 8 pixels
+//                    (presumably 0..2040 - confirm against callers).
+// isDifferential   - true for 5-bit quantization, false for 4-bit.
+//
+// The value is first rounded down; then all 8 combinations of "keep" / "round up" per
+// channel are evaluated and the best one is added to the rounded-down result.
+void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
+{
+    for (int ch = 0; ch < 3; ch++)
+    {
+        const MUInt15& cu15 = sectorCumulative[ch];
+
+        if (isDifferential)
+        {
+            //quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;
+            quantized[ch] = ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
+                    , 11)
+            );
+        }
+        else
+        {
+            //quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;
+            quantized[ch] = ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
+                    , 12)
+            );
+        }
+    }
+
+    MFloat lowOctantRGBFloat[3];
+    MFloat highOctantRGBFloat[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        // Expand the rounded-down value and its upper neighbour back to 8 bits. The
+        // neighbour is clamped, so at the top of the range "round up" equals "keep".
+        MUInt15 unquantized;
+        MUInt15 unquantizedNext;
+        if (isDifferential)
+        {
+            unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
+            MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));
+            unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);
+        }
+        else
+        {
+            unquantized = (quantized[ch] << 4) | quantized[ch];
+            unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
+        }
+        // Scale by 8 so the candidates are comparable with the 8-pixel sum.
+        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);
+        highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
+
+    MFloat cumulativeYUV[3];
+    ConvertToFakeBT709(cumulativeYUV, sectorCumulative);
+
+    // Bit 0/1/2 of octant selects the high candidate for R/G/B respectively.
+    for (uint16_t octant = 0; octant < 8; octant++)
+    {
+        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
+        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
+        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
+
+        MFloat octantYUV[3];
+        ConvertToFakeBT709(octantYUV, r, g, b);
+
+        MFloat delta[3];
+        for (int ch = 0; ch < 3; ch++)
+            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
+
+        // Squared Euclidean distance over all three components.
+        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
+        bestError = ParallelMath::Min(error, bestError);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
+}
+
+// Table-driven variant of the fake-BT.709 rounding decision.
+//
+// quantized        - out: quantized channel values, clamped to 31 (differential) or 15.
+// sectorCumulative - per-channel sum over the half-block's 8 pixels.
+// isDifferential   - true for 5-bit quantization, false for 4-bit.
+//
+// The rounded-down value is the top bits of the filled-in sum. The 4 bits directly below
+// it form a per-channel fraction, and the three fractions are packed as 0xRGB to index
+// g_rounding16, whose low three bits say which channels to round up (bit 0 = R,
+// bit 1 = G, bit 2 = B).
+void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
+{
+    // sectorCumulative range is 0..2040 (11 bits)
+    MUInt15 rOffset;
+    MUInt15 gOffset;
+    MUInt15 bOffset;
+    MUInt15 quantizedBase[3];
+    MUInt15 upperBound;
+
+    // Adds the top bits back into the bottom to stretch 0..2040 toward the full 11-bit range.
+    MUInt15 sectorCumulativeFillIn[3];
+    for (int ch = 0; ch < 3; ch++)
+        sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);
+
+    if (isDifferential)
+    {
+        // Quantized value is bits 6..10, so the fraction is bits 2..5. It is moved to
+        // nibble 2 (R, << 6), nibble 1 (G, << 2) and nibble 0 (B, >> 2).
+        rOffset = (sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);
+        gOffset = (sectorCumulativeFillIn[1] << 2) & ParallelMath::MakeUInt15(0x0f0);
+        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);
+
+        for (int ch = 0; ch < 3; ch++)
+            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);
+
+        upperBound = ParallelMath::MakeUInt15(31);
+    }
+    else
+    {
+        // Quantized value is bits 7..10, so the fraction is bits 3..6. It is moved to
+        // nibble 2 (R, << 5), nibble 1 (G, << 1) and nibble 0 (B, >> 3).
+        rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);
+        gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);
+        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);
+
+        for (int ch = 0; ch < 3; ch++)
+            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);
+
+        upperBound = ParallelMath::MakeUInt15(15);
+    }
+
+    MUInt15 lookupIndex = (rOffset | gOffset | bOffset);
+
+    MUInt15 octant;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);
+
+    quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));
+    quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));
+    quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));
+
+    // Rounding up from the maximum would overflow the field, so clamp.
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
+}
+
+// Chooses, per channel, whether a 4-bit T/H-mode colour should stay as given or be
+// rounded up by one, by minimizing squared distance to the target in fake-BT.709 space.
+//
+// quantized   - in/out: rounded-down 4-bit channel values on entry; each channel may be
+//               incremented by one on exit.
+// targets     - the colour to approximate, in the same scale as the candidates
+//               (presumably 8-bit value * granularity * 2 - confirm against callers).
+// granularity - scale factor applied to the expanded 8-bit candidates.
+void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)
+{
+    MFloat lowOctantRGBFloat[3];
+    MFloat highOctantRGBFloat[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        // Expand 4 bits to 8 by nibble replication; the upper neighbour is clamped, so
+        // at the top of the range "round up" equals "keep".
+        MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];
+        MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
+
+        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);
+        highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
+
+    MFloat cumulativeYUV[3];
+    ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));
+
+    // Bit 0/1/2 of octant selects the high candidate for R/G/B respectively.
+    for (uint16_t octant = 0; octant < 8; octant++)
+    {
+        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
+        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
+        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
+
+        MFloat octantYUV[3];
+        ConvertToFakeBT709(octantYUV, r, g, b);
+
+        MFloat delta[3];
+        for (int ch = 0; ch < 3; ch++)
+            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
+
+        // Squared Euclidean distance over all three components.
+        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
+        bestError = ParallelMath::Min(error, bestError);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
+}
+
+// Integer-input overload: converts each channel to float and forwards to the
+// three-component transform.
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])
+{
+    const MFloat red = ParallelMath::ToFloat(color[0]);
+    const MFloat green = ParallelMath::ToFloat(color[1]);
+    const MFloat blue = ParallelMath::ToFloat(color[2]);
+
+    ConvertToFakeBT709(yuv, red, green, blue);
+}
+
+// Array-input overload: unpacks the RGB triple and forwards to the three-component
+// transform.
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])
+{
+    const MFloat &red = color[0];
+    const MFloat &green = color[1];
+    const MFloat &blue = color[2];
+
+    ConvertToFakeBT709(yuv, red, green, blue);
+}
+
+// Core RGB -> fake-BT.709 transform: writes three fixed linear combinations of the
+// inputs to yuv[0..2].
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)
+{
+    // Copy the inputs first so the result is still correct if they alias elements of yuv.
+    const MFloat red = pr;
+    const MFloat green = pg;
+    const MFloat blue = pb;
+
+    yuv[0] = red * 0.368233989135369f + green * 1.23876274963149f + blue * 0.125054068802017f;
+    yuv[1] = red * 0.5f - green * 0.4541529f - blue * 0.04584709f;
+    yuv[2] = red * -0.081014709086133f - green * 0.272538676238785f + blue * 0.353553390593274f;
+}
+
+// Fake-BT.709 -> RGB transform: rescales the first component and combines it with fixed
+// multiples of the other two to produce rgb[0..2].
+void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])
+{
+    // Read every input before writing so the result is still correct if rgb aliases yuv.
+    const MFloat scaledLuma = yuv[0] * 0.57735026466774571071f;
+    const MFloat chromaU = yuv[1];
+    const MFloat chromaV = yuv[2];
+
+    rgb[0] = scaledLuma + chromaU * 1.5748000207960953486f;
+    rgb[1] = scaledLuma - chromaU * 0.46812425854364753669f - chromaV * 0.26491652528157560861f;
+    rgb[2] = scaledLuma + chromaV * 2.6242146882856944069f;
+}
+
+
+// Picks, for one pixel per lane, a selector index from the given alpha modifier table
+// and reports the value that selector reconstructs to.
+//
+// tableIndex         - row of the modifier/rounding tables to use.
+// value              - pixel value to approximate.
+// baseValue          - candidate base value, in the same units as value.
+// multiplier         - candidate multiplier; must be non-zero in every lane because it
+//                      is used as a divisor below.
+// is11Bit / isSigned - select the clamp range of the reconstructed value.
+// outIndexes         - out: selector index; 0..3 for the negative half of the table and
+//                      4..7 for the positive half.
+// outQuantizedValues - out: clamped reconstructed value.
+void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
+{
+    MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
+    MSInt16 offsetTimes2 = offset + offset;
+
+    // ETC2's offset tables all have a reflect about 0.5*multiplier
+    MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
+
+    MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));
+    MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, 1);
+
+    MUInt15 positiveIndex;
+    MUInt15 positiveOffsetUnmultiplied;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        // Distance from the reflection point in units of the multiplier, capped to the
+        // last column of the rounding table.
+        uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);
+        if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)
+            blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - 1;
+        uint16_t index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];
+        ParallelMath::PutUInt15(positiveIndex, block, index);
+        ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);
+
+        // TODO: This is suboptimal when the offset is capped.  We should detect 0 and 255 values and always map them to the maximum offsets.
+        // Doing that will also affect refinement though.
+    }
+
+    // signBits is all ones in lanes where the reflected offset is negative, zero elsewhere.
+    // XOR with it turns a positive table entry p into -p - 1 in those lanes.
+    MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, 15);
+    MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;
+    MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);
+
+    MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;
+
+    // Clamp to the representable range: 1..2047 (signed 11-bit), 0..2047 (unsigned 11-bit)
+    // or 0..255 (8-bit).
+    if (is11Bit)
+    {
+        if (isSigned)
+            outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(1), offsetValue)));
+        else
+            outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
+    }
+    else
+        outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(255), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
+
+    // indexSub is 4 in negative lanes and 0 otherwise, so positive lanes get index + 4
+    // and negative lanes keep index.
+    MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(4);
+
+    outIndexes = positiveIndex + ParallelMath::MakeUInt15(4) - indexSub;
+}
+
+
+// Writes one 8-byte T-mode block.
+//
+// outputBuffer    - receives 8 bytes: colour/table word first, then the selector word,
+//                   each most-significant byte first.
+// lineColor       - 4-bit RGB written to bits 44, 40 and 36 of the block.
+// isolatedColor   - 4-bit RGB; red is split into two 2-bit pieces around a padding bit.
+// packedSelectors - sixteen 2-bit selectors, pixel p at bits 2p..2p+1.
+// table           - 3-bit table index, split into two high bits and one low bit.
+// opaque          - sets bit 33 of the block when true.
+void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
+{
+    static const int transposedPixel[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    const int redHigh = ((isolatedColor[0] >> 2) & 3);
+    const int redLow = (isolatedColor[0] & 3);
+
+    uint32_t colorWord = 0;
+
+    // The padding bits must make the 5-bit red field plus the signed 3-bit field that
+    // follows it leave the 0..31 range.
+    if (redHigh + redLow >= 4)
+    {
+        // Overflow high
+        colorWord |= 7 << (61 - 32);
+    }
+    else
+    {
+        // Overflow low
+        colorWord |= 1 << (58 - 32);
+    }
+
+    colorWord |= redHigh << (59 - 32);
+    colorWord |= redLow << (56 - 32);
+
+    // Remaining isolated-colour channels, then the line colour, one nibble each.
+    colorWord |= isolatedColor[1] << (52 - 32);
+    colorWord |= isolatedColor[2] << (48 - 32);
+    for (int ch = 0; ch < 3; ch++)
+        colorWord |= lineColor[ch] << ((44 - 32) - ch * 4);
+
+    colorWord |= ((table >> 1) & 3) << (34 - 32);
+    colorWord |= (table & 1) << (32 - 32);
+    if (opaque)
+        colorWord |= 1 << (33 - 32);
+
+    // Low selector bits fill bits 0..15 and high selector bits fill bits 16..31, both in
+    // transposed pixel order.
+    uint32_t selectorWord = 0;
+    for (int px = 0; px < 16; px++)
+    {
+        const int selector = (packedSelectors >> (2 * transposedPixel[px])) & 3;
+        selectorWord |= ((selector & 0x1) << px);
+        selectorWord |= (((selector >> 1) & 0x1) << (16 + px));
+    }
+
+    for (int i = 0; i < 4; i++)
+    {
+        const int shift = 24 - i * 8;
+        outputBuffer[i] = (colorWord >> shift) & 0xff;
+        outputBuffer[i + 4] = (selectorWord >> shift) & 0xff;
+    }
+}
+
+// Writes one 8-byte H-mode block, falling back to T mode when both base colours match.
+//
+// outputBuffer - receives 8 bytes: colour/table word first, then the selector word,
+//                each most-significant byte first.
+// blockColors  - two colours packed as R << 10 | G << 5 | B; 4 bits of each field are used.
+// sectorBits   - bit p selects which base colour pixel p uses.
+// signBits     - bit p selects the sign of the modifier for pixel p.
+// table        - 3-bit table index; its low bit is encoded by the order of the colours.
+// opaque       - sets bit 33 of the block when true.
+void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
+{
+    if (blockColors[0] == blockColors[1])
+    {
+        // Identical base colours cannot express the table's low bit through their order,
+        // so emit an equivalent T-mode block where every pixel lies on the line.
+        ParallelMath::ScalarUInt16 sharedColor[3];
+        for (int ch = 0; ch < 3; ch++)
+            sharedColor[ch] = (blockColors[0] >> ((2 - ch) * 5)) & 0x1f;
+
+        // Every selector has its low bit set; the sign bit becomes the high bit.
+        int32_t tSelectors = 0x55555555;
+        for (int px = 0; px < 16; px++)
+            tSelectors |= ((signBits >> px) & 1) << ((px * 2) + 1);
+
+        EmitTModeBlock(outputBuffer, sharedColor, sharedColor, tSelectors, table, opaque);
+        return;
+    }
+
+    static const int transposedPixel[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    // The low table bit is carried by the colour order. If the order does not already
+    // match, exchange the colours and invert which colour each pixel refers to.
+    const bool wantFirstGreater = ((table & 1) == 1);
+    const bool haveFirstGreater = (blockColors[0] > blockColors[1]);
+    const bool exchange = (wantFirstGreater != haveFirstGreater);
+
+    ParallelMath::ScalarUInt16 effectiveSectorBits = sectorBits;
+    if (exchange)
+        effectiveSectorBits ^= 0xffff;
+
+    const ParallelMath::ScalarUInt16 firstColor = blockColors[exchange ? 1 : 0];
+    const ParallelMath::ScalarUInt16 secondColor = blockColors[exchange ? 0 : 1];
+
+    int16_t first[3];
+    int16_t second[3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        const int fieldShift = (2 - ch) * 5;
+        first[ch] = (firstColor >> fieldShift) & 15;
+        second[ch] = (secondColor >> fieldShift) & 15;
+    }
+
+    // The first colour is scattered across the padded bit fields.
+    const int r1 = first[0];
+    const int g1High = first[1] >> 1;
+    const int g1Low = (first[1] & 1);
+    const int b1High = first[2] >> 3;
+    const int b1Low = first[2] & 7;
+
+    uint32_t colorWord = 0;
+
+    // Avoid overflowing R
+    if ((g1High & 4) != 0 && r1 + g1High < 8)
+        colorWord |= 1 << (63 - 32);
+
+    // The next 5-bit field plus its signed 3-bit delta must leave 0..31.
+    const int pseudoDelta = b1Low >> 1;
+    const int pseudoGreen = b1High | (g1Low << 1);
+
+    if (pseudoGreen + pseudoDelta >= 4)
+    {
+        // Overflow high
+        colorWord |= 7 << (53 - 32);
+    }
+    else
+    {
+        // Overflow low
+        colorWord |= 1 << (50 - 32);
+    }
+
+    colorWord |= r1 << (59 - 32);
+    colorWord |= g1High << (56 - 32);
+    colorWord |= g1Low << (52 - 32);
+    colorWord |= b1High << (51 - 32);
+    colorWord |= b1Low << (47 - 32);
+
+    // Second colour: three contiguous nibbles.
+    colorWord |= second[0] << (43 - 32);
+    colorWord |= second[1] << (39 - 32);
+    colorWord |= second[2] << (35 - 32);
+
+    // Upper two table bits surround the opaque flag.
+    colorWord |= ((table >> 2) & 1) << (34 - 32);
+    if (opaque)
+        colorWord |= 1 << (33 - 32);
+    colorWord |= ((table >> 1) & 1) << (32 - 32);
+
+    // Sign bits fill bits 0..15 and colour-choice bits fill bits 16..31, both in
+    // transposed pixel order.
+    uint32_t selectorWord = 0;
+    for (int px = 0; px < 16; px++)
+    {
+        const int sourcePixel = transposedPixel[px];
+        const int chooseSecond = (effectiveSectorBits >> sourcePixel) & 1;
+        const int negative = (signBits >> sourcePixel) & 1;
+
+        selectorWord |= (negative << px);
+        selectorWord |= (chooseSecond << (16 + px));
+    }
+
+    for (int i = 0; i < 4; i++)
+    {
+        const int shift = 24 - i * 8;
+        outputBuffer[i] = (colorWord >> shift) & 0xff;
+        outputBuffer[i + 4] = (selectorWord >> shift) & 0xff;
+    }
+}
+
+// Writes one 8-byte ETC1-layout block.
+//
+// outputBuffer       - receives 8 bytes: colour/table word first, then the selector word,
+//                      each most-significant byte first.
+// blockBestFlip      - flip bit; also selects the pixel-to-sector mapping.
+// blockBestD         - 0 stores two 4-bit colours per channel; otherwise a 5-bit colour
+//                      plus a 3-bit wrapped delta per channel.
+// blockBestColors    - [sector][channel] quantized colours.
+// blockBestTables    - modifier table index per sector.
+// blockBestSelectors - eight 2-bit selectors per sector, pixel p at bits 2p..2p+1.
+// transparent        - when true the bit normally holding blockBestD is left clear.
+void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)
+{
+    // Maps an internal selector to its stored 2-bit code.
+    static const uint8_t selectorToCode[4] = { 3, 2, 0, 1 };
+    static const int transposedPixel[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    uint32_t colorWord = 0;
+
+    // Each channel owns one byte of the top three bytes, red first.
+    const bool individual = (blockBestD == 0);
+    for (int ch = 0; ch < 3; ch++)
+    {
+        const int byteShift = 24 - ch * 8;
+        const int firstColor = blockBestColors[0][ch];
+        const int secondColor = blockBestColors[1][ch];
+
+        if (individual)
+        {
+            colorWord |= firstColor << (byteShift + 4);
+            colorWord |= secondColor << byteShift;
+        }
+        else
+        {
+            colorWord |= firstColor << (byteShift + 3);
+            colorWord |= ((secondColor - firstColor) & 7) << byteShift;
+        }
+    }
+
+    colorWord |= (blockBestTables[0] << 5);
+    colorWord |= (blockBestTables[1] << 2);
+    colorWord |= blockBestFlip;
+    if (!transparent)
+        colorWord |= (blockBestD << 1);
+
+    // Scatter each sector's selector codes to their pixel positions.
+    uint8_t codeAtPixel[16];
+    for (int sector = 0; sector < 2; sector++)
+    {
+        const int packedSelectors = blockBestSelectors[sector];
+
+        for (int px = 0; px < 8; px++)
+        {
+            const int selector = (packedSelectors >> (2 * px)) & 3;
+            codeAtPixel[g_flipTables[blockBestFlip][sector][px]] = selectorToCode[selector];
+        }
+    }
+
+    // Low code bits fill bits 0..15 and high code bits fill bits 16..31, both in
+    // transposed pixel order.
+    uint32_t selectorWord = 0;
+    for (int px = 0; px < 16; px++)
+    {
+        const int code = codeAtPixel[transposedPixel[px]];
+        selectorWord |= (code & 1) << px;
+        selectorWord |= ((code >> 1) & 1) << (px + 16);
+    }
+
+    for (int i = 0; i < 4; i++)
+    {
+        const int shift = 24 - i * 8;
+        outputBuffer[i] = (colorWord >> shift) & 0xff;
+        outputBuffer[i + 4] = (selectorWord >> shift) & 0xff;
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
+{
+	int numTries = 0;
+
+    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
+    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
+
+    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
+    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
+    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
+    MUInt15 bestFlip = zeroU15;
+    MUInt15 bestD = zeroU15;
+
+    MUInt15 sectorPixels[2][2][8][3];
+    MFloat sectorPreWeightedPixels[2][2][8][3];
+    MUInt15 sectorCumulative[2][2][3];
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    for (int flip = 0; flip < 2; flip++)
+	{
+		for (int sector = 0; sector < 2; sector++)
+		{
+			for (int ch = 0; ch < 3; ch++)
+				sectorCumulative[flip][sector][ch] = zeroU15;
+
+			for (int px = 0; px < 8; px++)
+			{
+				for (int ch = 0; ch < 3; ch++)
+				{
+					MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
+					sectorPixels[flip][sector][px][ch] = pixelChannelValue;
+                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
+					sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
+				}
+			}
+		}
+	}
+
+	static const MSInt16 modifierTables[8][4] =
+	{
+		{ ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), ParallelMath::MakeSInt16(8) },
+		{ ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },
+		{ ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },
+		{ ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },
+		{ ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },
+		{ ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },
+		{ ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },
+		{ ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },
+	};
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    int minD = punchthrough ? 1 : 0;
+
+	for (int flip = 0; flip < 2; flip++)
+	{
+		drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
+
+		MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
+		MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };
+		MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };
+		MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };
+
+		for (int d = minD; d < 2; d++)
+		{
+			for (int sector = 0; sector < 2; sector++)
+			{
+				const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;
+
+				for (int table = 0; table < 8; table++)
+				{
+					int16_t numOffsets = *potentialOffsets++;
+
+					MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];
+
+                    MUInt15 quantized[3];
+                    for (int oi = 0; oi < numOffsets; oi++)
+                    {
+                        if (!isFakeBT709)
+                        {
+						    for (int ch = 0; ch < 3; ch++)
+						    {
+                                // cu is in range 0..2040
+                                MUInt15 cu15 = ParallelMath::Min(
+                                    ParallelMath::MakeUInt15(2040),
+                                    ParallelMath::ToUInt15(
+                                        ParallelMath::Max(
+                                            ParallelMath::MakeSInt16(0),
+                                            ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
+                                        )
+                                    )
+                                );
+
+                                if (d == 1)
+                                {
+                                    //quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;
+                                    quantized[ch] = ParallelMath::ToUInt15(
+                                        ParallelMath::RightShift(
+                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)
+                                            , 11)
+                                        );
+                                }
+                                else
+                                {
+                                    //quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;
+                                    quantized[ch] = ParallelMath::ToUInt15(
+                                        ParallelMath::RightShift(
+                                        (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)
+                                            , 12)
+                                    );
+                                }
+						    }
+                        }
+                        else
+                        {
+                            MUInt15 offsetCumulative[3];
+						    for (int ch = 0; ch < 3; ch++)
+						    {
+                                // cu is in range 0..2040
+                                MUInt15 cu15 = ParallelMath::Min(
+                                    ParallelMath::MakeUInt15(2040),
+                                    ParallelMath::ToUInt15(
+                                        ParallelMath::Max(
+                                            ParallelMath::MakeSInt16(0),
+                                            ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
+                                        )
+                                    )
+                                );
+
+                                offsetCumulative[ch] = cu15;
+						    }
+
+                            if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0)
+                                ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);
+                            else
+                                ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);
+                        }
+
+						possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+					}
+
+					potentialOffsets += numOffsets;
+
+                    ParallelMath::UInt15 numUniqueColors;
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        uint16_t blockNumUniqueColors = 1;
+                        for (int i = 1; i < numOffsets; i++)
+                        {
+                            uint16_t color = ParallelMath::Extract(possibleColors[i], block);
+                            if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
+                                ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
+                        }
+
+                        ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
+                    }
+
+                    int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
+                    for (int block = 1; block < ParallelMath::ParallelSize; block++)
+                        maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
+
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
+                        for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
+                            ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
+                    }
+
+					for (int i = 0; i < maxUniqueColors; i++)
+					{
+						MFloat error = ParallelMath::MakeFloatZero();
+						MUInt16 selectors = ParallelMath::MakeUInt16(0);
+                        MUInt15 quantized = possibleColors[i];
+						TestHalfBlock(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);
+
+						if (d == 0)
+						{
+                            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
+							if (ParallelMath::AnySet(errorBetter))
+							{
+								bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
+								ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
+                                ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, quantized);
+                                ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
+							}
+						}
+						else
+						{
+                            ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
+
+							MUInt15 storageIndexes = drs.diffNumAttempts[sector];
+                            drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
+
+                            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                            {
+                                int storageIndex = ParallelMath::Extract(storageIndexes, block);
+
+                                ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
+                                ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
+                                ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
+                                ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
+                            }
+						}
+					}
+				}
+			}
+
+			if (d == 0)
+			{
+				MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];
+                ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
+				if (ParallelMath::AnySet(errorBetter))
+				{
+                    bestIsThisMode = bestIsThisMode | errorBetter;
+
+					bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
+					ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
+                    ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
+					for (int sector = 0; sector < 2; sector++)
+					{
+                        ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
+                        ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
+                        ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
+					}
+				}
+			}
+			else
+			{
+                ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
+                FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
+			}
+		}
+	}
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (!ParallelMath::Extract(bestIsThisMode, block))
+            continue;
+
+        uint32_t highBits = 0;
+        uint32_t lowBits = 0;
+
+        int blockBestFlip = ParallelMath::Extract(bestFlip, block);
+        int blockBestD = ParallelMath::Extract(bestD, block);
+        int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };
+        ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };
+
+        int colors[2][3];
+        for (int sector = 0; sector < 2; sector++)
+        {
+            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
+            for (int ch = 0; ch < 3; ch++)
+                colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
+        }
+
+        EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
+    }
+}
+
+
+// Searches the punchthrough-alpha variant of the ETC1-style (two sector) encoding for a group
+// of parallel blocks and emits every block for which it beats the error passed in.
+//
+// bestTotalError     In/out running best error per block; updated by FindBestDifferentialCombination.
+// outputBuffer       Receives 8 bytes per block at offset (block * 8), only for blocks this mode won.
+// pixels             The 16 block pixels, 3 channels each.
+// preWeightedPixels  Matching pre-weighted pixel values, forwarded to the half-block evaluation.
+// isTransparent      Per-pixel transparency flags.
+// drs                Scratch storage that receives every candidate (error, selectors, color, table).
+// options            Forwarded to TestHalfBlockPunchthrough.
+//
+// For each flip orientation the pixels are regrouped into two 8-pixel sectors via g_flipTables,
+// candidate base colors are generated per sector and modifier table, each candidate is scored,
+// and FindBestDifferentialCombination picks the pair to keep. Candidate generation here does not
+// consult the ETC_UseFakeBT709 flag directly.
+void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)
+{
+    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
+    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
+
+    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
+    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
+    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
+    MUInt15 bestFlip = zeroU15;
+
+    MUInt15 sectorPixels[2][2][8][3];
+    ParallelMath::Int16CompFlag sectorTransparent[2][2][8];
+    MFloat sectorPreWeightedPixels[2][2][8][3];
+    MUInt15 sectorCumulative[2][2][3];
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    // Regroup the block into [flip][sector][px] order and accumulate per-channel sums per sector.
+    // Every pixel of the sector contributes to the sum, whatever its transparency flag.
+    for (int flip = 0; flip < 2; flip++)
+    {
+        for (int sector = 0; sector < 2; sector++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                sectorCumulative[flip][sector][ch] = zeroU15;
+
+            for (int px = 0; px < 8; px++)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
+                    sectorPixels[flip][sector][px][ch] = pixelChannelValue;
+                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
+                    sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
+                }
+
+                sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
+            }
+        }
+    }
+
+    // One modifier magnitude per table index.
+    static const MUInt15 modifiers[8] =
+    {
+        ParallelMath::MakeUInt15(8),
+        ParallelMath::MakeUInt15(17),
+        ParallelMath::MakeUInt15(29),
+        ParallelMath::MakeUInt15(42),
+        ParallelMath::MakeUInt15(60),
+        ParallelMath::MakeUInt15(80),
+        ParallelMath::MakeUInt15(106),
+        ParallelMath::MakeUInt15(183),
+    };
+
+    // Offset multipliers run from -count to +count with count <= 8 pixels, i.e. at most 17 values.
+    const int maxSectorCumulativeOffsets = 17;
+
+    for (int flip = 0; flip < 2; flip++)
+    {
+        // NOTE(review): canIgnoreSector[1] starts out false and is only ever AND-ed below, so
+        // (assuming '&' is a lane-wise AND) it can never become true for sector 1 - confirm
+        // whether this asymmetry with sector 0 is intentional.
+        ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };
+
+        for (int sector = 0; sector < 2; sector++)
+            for (int px = 0; px < 8; px++)
+                canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];
+
+        // Candidate lists are rebuilt from scratch for every flip orientation.
+        drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
+
+        for (int sector = 0; sector < 2; sector++)
+        {
+            // NOTE(review): 1 is added where sectorTransparent is set; if SelectOrZero yields its
+            // value for set lanes, this totals the transparent pixels despite the variable name -
+            // confirm the intended polarity.
+            MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);
+            for (int px = 0; px < 8; px++)
+                sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1));
+
+            // The offset loop is shared by all lanes, so it is sized by the largest per-block count.
+            int sectorMaxOpaque = 0;
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));
+
+            int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1;
+
+            // The denominator is clamped to at least 1 * 256 so the scalar division below never divides by zero.
+            MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8;
+            MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7;
+
+            MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
+            MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;
+
+            MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));
+
+            for (int table = 0; table < 8; table++)
+            {
+                MUInt15 possibleColors[maxSectorCumulativeOffsets];
+
+                MUInt15 quantized[3];
+                for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
+                {
+                    // Lanes with a smaller count clamp the multiplier to their own +/- count.
+                    MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned);
+                    MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);
+
+                    for (int ch = 0; ch < 3; ch++)
+                    {
+                        // cu is in range 0..255*numOpaque (at most 0..2040)
+                        MUInt15 cu15 = ParallelMath::Min(
+                            sectorCumulativeMax,
+                            ParallelMath::ToUInt15(
+                                ParallelMath::Max(
+                                    ParallelMath::MakeSInt16(0),
+                                    ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
+                                )
+                            )
+                        );
+
+                        //quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)
+                        MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);
+                        MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);
+                        MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
+                        // The divisor differs per lane, so the division is done lane by lane in scalar code.
+                        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                            ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
+                    }
+
+                    // Pack the three quantized channels at 5 bits each.
+                    possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+                }
+
+                // Per lane, collapse runs of identical consecutive colors in place and record how many remain.
+                ParallelMath::UInt15 numUniqueColors;
+                for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                {
+                    uint16_t blockNumUniqueColors = 1;
+                    for (int i = 1; i < sectorNumOpaqueMultipliers; i++)
+                    {
+                        uint16_t color = ParallelMath::Extract(possibleColors[i], block);
+                        if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
+                            ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
+                    }
+
+                    ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
+                }
+
+                int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
+                for (int block = 1; block < ParallelMath::ParallelSize; block++)
+                    maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
+
+                // Pad lanes with fewer unique colors using their first color so that every lane
+                // holds a defined value for all maxUniqueColors iterations below.
+                for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                {
+                    uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
+                    for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
+                        ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
+                }
+
+                for (int i = 0; i < maxUniqueColors; i++)
+                {
+                    MFloat error = ParallelMath::MakeFloatZero();
+                    MUInt16 selectors = ParallelMath::MakeUInt16(0);
+                    MUInt15 quantized = possibleColors[i];
+                    TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);
+
+                    // Padding entries are still written, but the attempt count only advances for
+                    // real entries, so a padding entry sits just past the counted range until a
+                    // later real entry overwrites it.
+                    ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
+
+                    MUInt15 storageIndexes = drs.diffNumAttempts[sector];
+                    drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
+
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int storageIndex = ParallelMath::Extract(storageIndexes, block);
+
+                        ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
+                        ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
+                        ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
+                        ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
+                    }
+                }
+            }
+        }
+
+        // The "d" output is not needed by this mode: the block is always emitted with d = 1 below.
+        MUInt15 bestDDummy = ParallelMath::MakeUInt15(0);
+        FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
+    }
+
+    // Emit only the blocks for which this mode produced the best result.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (!ParallelMath::Extract(bestIsThisMode, block))
+            continue;
+
+        int blockBestColors[2][3];
+        int blockBestTables[2];
+        ParallelMath::ScalarUInt16 blockBestSelectors[2];
+        for (int sector = 0; sector < 2; sector++)
+        {
+            // Unpack the 5-bit-per-channel packed color.
+            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
+            for (int ch = 0; ch < 3; ch++)
+                blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
+
+            blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
+            blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
+        }
+
+        EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true);
+    }
+}
+
+
+// Allocates and constructs the ETC1 scratch data object.
+//
+// allocFunc is called with 'context' and the size of the internal data structure. The object is
+// constructed in place in the returned storage and keeps 'context' so that ReleaseETC1Data can
+// pass it back to the free function. Returns NULL when allocFunc returns a null pointer.
+cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)
+{
+    typedef cvtt::Internal::ETCComputer::ETC1CompressionDataInternal InternalData;
+
+    void *buffer = allocFunc(context, sizeof(InternalData));
+    if (!buffer)
+        return NULL;
+
+    // Return the pointer produced by placement new, converted derived -> base, instead of
+    // casting the raw void* straight to the base type. This mirrors the base -> derived
+    // static_cast performed on release and does not depend on where the base subobject lives.
+    InternalData *internalData = new (buffer) InternalData(context);
+    return internalData;
+}
+
+// Destroys and frees data previously returned by AllocETC1Data.
+//
+// freeFunc receives the context stored at allocation time, the original pointer and the size of
+// the internal data structure. A null compressionData (for example the result of a failed
+// AllocETC1Data) is ignored.
+void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
+{
+    if (!compressionData)
+        return;
+
+    cvtt::Internal::ETCComputer::ETC1CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC1CompressionDataInternal*>(compressionData);
+
+    // Read the context before running the destructor: it is stored inside the object itself.
+    void *context = internalData->m_context;
+    internalData->~ETC1CompressionDataInternal();
+    freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
+}
+
+// Allocates and constructs the ETC2 scratch data object.
+//
+// allocFunc is called with 'context' and the size of the internal data structure. The object is
+// constructed in place from 'context' and 'options', and keeps 'context' so that ReleaseETC2Data
+// can pass it back to the free function. Returns NULL when allocFunc returns a null pointer.
+cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)
+{
+    typedef cvtt::Internal::ETCComputer::ETC2CompressionDataInternal InternalData;
+
+    void *buffer = allocFunc(context, sizeof(InternalData));
+    if (!buffer)
+        return NULL;
+
+    // Return the pointer produced by placement new, converted derived -> base, instead of
+    // casting the raw void* straight to the base type. This mirrors the base -> derived
+    // static_cast performed on release and does not depend on where the base subobject lives.
+    InternalData *internalData = new (buffer) InternalData(context, options);
+    return internalData;
+}
+
+// Destroys and frees data previously returned by AllocETC2Data.
+//
+// freeFunc receives the context stored at allocation time, the original pointer and the size of
+// the internal data structure. A null compressionData (for example the result of a failed
+// AllocETC2Data) is ignored.
+void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
+{
+    if (!compressionData)
+        return;
+
+    cvtt::Internal::ETCComputer::ETC2CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC2CompressionDataInternal*>(compressionData);
+
+    // Read the context before running the destructor: it is stored inside the object itself.
+    void *context = internalData->m_context;
+    internalData->~ETC2CompressionDataInternal();
+    freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
+}
+
+// Stores the allocator context and derives the two chroma side axes from the channel weights.
+//
+// The (red, green, blue) weight vector is rotated by one component and the part of the rotated
+// vector that lies along the weight vector is subtracted, giving axis 0. Axis 1 is the cross
+// product of axis 0 with the weight vector, rescaled to the same length as axis 0.
+//
+// NOTE(review): when all three weights are equal the rotated vector equals the weight vector,
+// so axis 0 and the cross product are both zero and the length ratio becomes 0 / 0 - confirm
+// that the axes are never relied on in that configuration.
+cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)
+    : m_context(context)
+{
+    const float weights[3] = { options.redWeight, options.greenWeight, options.blueWeight };
+    const float rotated[3] = { weights[1], weights[2], weights[0] };
+
+    // Negated projection coefficient of the rotated vector onto the weight vector.
+    const float rotatedDotWeights = rotated[0] * weights[0] + rotated[1] * weights[1] + rotated[2] * weights[2];
+    const float weightsLengthSq = weights[0] * weights[0] + weights[1] * weights[1] + weights[2] * weights[2];
+    const float projection = -rotatedDotWeights / weightsLengthSq;
+
+    float axis0[3];
+    for (int ch = 0; ch < 3; ch++)
+        axis0[ch] = rotated[ch] + weights[ch] * projection;
+
+    // Cross product axis0 x weights, written with cyclic component indices.
+    float axis1Raw[3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        const int next = (ch + 1) % 3;
+        const int afterNext = (ch + 2) % 3;
+        axis1Raw[ch] = axis0[next] * weights[afterNext] - axis0[afterNext] * weights[next];
+    }
+
+    const float axis0LengthSq = (axis0[0] * axis0[0] + axis0[1] * axis0[1] + axis0[2] * axis0[2]);
+    const float axis1RawLengthSq = (axis1Raw[0] * axis1Raw[0] + axis1Raw[1] * axis1Raw[1] + axis1Raw[2] * axis1Raw[2]);
+
+    // Scale factor that gives axis 1 the same length as axis 0.
+    const float lengthRatio = static_cast<float>(std::sqrt(axis0LengthSq / axis1RawLengthSq));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        m_chromaSideAxis0[ch] = axis0[ch];
+        m_chromaSideAxis1[ch] = axis1Raw[ch] * lengthRatio;
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC.h b/thirdparty/cvtt/ConvectionKernels_ETC.h
new file mode 100644
index 0000000000..5e3c4d74fd
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC.h
@@ -0,0 +1,126 @@
+#pragma once
+#ifndef __CVTT_CONVECTIONKERNELS_ETC_H__
+#define __CVTT_CONVECTIONKERNELS_ETC_H__
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    struct Options; // Forward declaration.
+
+    namespace Internal
+    {
+        class ETCComputer // ETC1 / ETC2 / EAC block compression entry points and their helpers; every function declared here is static.
+        {
+        public:
+            static void CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options);
+            static void CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha);
+            static void CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, const Options &options);
+            static void CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options);
+
+            static ETC2CompressionData *AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options); // Returns NULL when allocFunc returns a null pointer.
+            static void ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc); // Runs the destructor, then hands the storage to freeFunc with the stored context.
+
+            static ETC1CompressionData *AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context); // Returns NULL when allocFunc returns a null pointer.
+            static void ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc); // Runs the destructor, then hands the storage to freeFunc with the stored context.
+
+        private:
+            typedef ParallelMath::Float MFloat; // Short aliases for the ParallelMath types used by the declarations below.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+            struct DifferentialResolveStorage // Scratch arrays of per-sector candidate encodings, filled by the ETC1 searches and read by FindBestDifferentialCombination.
+            {
+                static const unsigned int MaxAttemptsPerSector = 57 + 81 + 81 + 81 + 81 + 81 + 81 + 81; // 57 + 7 * 81 = 624 entries per sector.
+
+                MUInt15 diffNumAttempts[2]; // Per-sector number of entries recorded in the arrays below.
+                MFloat diffErrors[2][MaxAttemptsPerSector]; // Error of each recorded attempt.
+                MUInt16 diffSelectors[2][MaxAttemptsPerSector]; // Selectors of each recorded attempt.
+                MUInt15 diffColors[2][MaxAttemptsPerSector]; // Packed quantized color of each attempt (three channels, 5 bits each).
+                MUInt15 diffTables[2][MaxAttemptsPerSector]; // Modifier table index of each attempt.
+
+                uint16_t attemptSortIndexes[2][MaxAttemptsPerSector]; // Presumably a per-sector ordering of the attempts - TODO confirm against FindBestDifferentialCombination.
+            };
+
+            struct HModeEval // Scratch storage handed to EncodeHMode.
+            {
+                MFloat errors[62][16];
+                MUInt16 signBits[62];
+                MUInt15 uniqueQuantizedColors[62];
+                MUInt15 numUniqueColors[2];
+            };
+
+            struct ETC1CompressionDataInternal : public cvtt::ETC1CompressionData // Concrete object behind the ETC1CompressionData pointers returned by AllocETC1Data.
+            {
+                explicit ETC1CompressionDataInternal(void *context)
+                    : m_context(context)
+                {
+                }
+
+                DifferentialResolveStorage m_drs;
+                void *m_context; // Allocator context; read back by ReleaseETC1Data and passed to the free function.
+            };
+
+            struct ETC2CompressionDataInternal : public cvtt::ETC2CompressionData // Concrete object behind the ETC2CompressionData pointers returned by AllocETC2Data.
+            {
+                explicit ETC2CompressionDataInternal(void *context, const cvtt::Options &options); // Stores context and derives the chroma side axes from the red/green/blue weights in options.
+
+                HModeEval m_h;
+                DifferentialResolveStorage m_drs;
+
+                void *m_context; // Allocator context; read back by ReleaseETC2Data and passed to the free function.
+                float m_chromaSideAxis0[3]; // First chroma axis computed by the constructor.
+                float m_chromaSideAxis1[3]; // Second chroma axis, scaled by the constructor to the length of the first.
+            };
+
+            static MFloat ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3]);
+            static MFloat ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat pixelB[3], const Options options); // NOTE(review): options is taken by value here, unlike the other helpers, which take const Options & - confirm this is intended.
+            static MFloat ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat pixelB[3]);
+
+            static void TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options);
+            static void TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options);
+            static void FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs);
+
+            static ParallelMath::Int16CompFlag ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b);
+            static ParallelMath::Int16CompFlag ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b);
+            static bool ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b);
+            static bool ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b);
+
+            static void EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options);
+            static void EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options);
+
+            static void EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options);
+
+            static MUInt15 DecodePlanarCoeff(const MUInt15 &coeff, int ch);
+            static void EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options);
+
+            static void CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage& compressionData, const Options &options, bool punchthrough); // The definition refers to the storage parameter as 'drs'.
+            static void CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage& compressionData, const Options &options); // The definition names the storage parameter 'drs'; winning blocks are written 8 bytes apart in outputBuffer.
+            static void CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options);
+
+            static void ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options);
+
+            static void ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential); // Chosen over the Fast variant when ETC_FakeBT709Accurate is set in options.flags.
+            static void ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential);
+            static void ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 target[3], const MUInt15 &granularity);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3]);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3]);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MFloat &r, const MFloat &g, const MFloat &b);
+            static void ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3]);
+
+            static void QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues);
+
+            static void EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque);
+            static void EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque);
+            static void EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent);
+
+            static const int g_flipTables[2][2][8]; // Indexed [flip][sector][px]; yields the index of that pixel within the 16-pixel block.
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC1.h b/thirdparty/cvtt/ConvectionKernels_ETC1.h
new file mode 100644
index 0000000000..775e41669f
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC1.h
@@ -0,0 +1,29 @@
+#include <stdint.h>
+
+namespace cvtt
+{
+    namespace Tables
+    {
+        namespace ETC1
+        {
+            const int16_t g_potentialOffsets4[] = // Eight variable-length runs: each starts with a count (57 or 81) followed by that many ascending offsets, symmetric around 0.
+            {
+                57, -64, -58, -54, -52, -48, -46, -44, -42, -40, -38, -36, -34, -32, -30, -28, -26, -24, -22, -20, -18, -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 52, 54, 58, 64,
+                81, -136, -124, -114, -112, -102, -100, -92, -90, -88, -80, -78, -76, -70, -68, -66, -64, -58, -56, -54, -52, -48, -46, -44, -42, -40, -36, -34, -32, -30, -26, -24, -22, -20, -18, -14, -12, -10, -8, -4, -2, 0, 2, 4, 8, 10, 12, 14, 18, 20, 22, 24, 26, 30, 32, 34, 36, 40, 42, 44, 46, 48, 52, 54, 56, 58, 64, 66, 68, 70, 76, 78, 80, 88, 90, 92, 100, 102, 112, 114, 124, 136,
+                81, -232, -212, -194, -192, -174, -172, -156, -154, -152, -136, -134, -132, -118, -116, -114, -112, -98, -96, -94, -92, -80, -78, -76, -74, -72, -60, -58, -56, -54, -42, -40, -38, -36, -34, -22, -20, -18, -16, -4, -2, 0, 2, 4, 16, 18, 20, 22, 34, 36, 38, 40, 42, 54, 56, 58, 60, 72, 74, 76, 78, 80, 92, 94, 96, 98, 112, 114, 116, 118, 132, 134, 136, 152, 154, 156, 172, 174, 192, 194, 212, 232,
+                81, -336, -307, -281, -278, -252, -249, -226, -223, -220, -197, -194, -191, -171, -168, -165, -162, -142, -139, -136, -133, -116, -113, -110, -107, -104, -87, -84, -81, -78, -61, -58, -55, -52, -49, -32, -29, -26, -23, -6, -3, 0, 3, 6, 23, 26, 29, 32, 49, 52, 55, 58, 61, 78, 81, 84, 87, 104, 107, 110, 113, 116, 133, 136, 139, 142, 162, 165, 168, 171, 191, 194, 197, 220, 223, 226, 249, 252, 278, 281, 307, 336,
+                81, -480, -438, -402, -396, -360, -354, -324, -318, -312, -282, -276, -270, -246, -240, -234, -228, -204, -198, -192, -186, -168, -162, -156, -150, -144, -126, -120, -114, -108, -90, -84, -78, -72, -66, -48, -42, -36, -30, -12, -6, 0, 6, 12, 30, 36, 42, 48, 66, 72, 78, 84, 90, 108, 114, 120, 126, 144, 150, 156, 162, 168, 186, 192, 198, 204, 228, 234, 240, 246, 270, 276, 282, 312, 318, 324, 354, 360, 396, 402, 438, 480,
+                81, -640, -584, -536, -528, -480, -472, -432, -424, -416, -376, -368, -360, -328, -320, -312, -304, -272, -264, -256, -248, -224, -216, -208, -200, -192, -168, -160, -152, -144, -120, -112, -104, -96, -88, -64, -56, -48, -40, -16, -8, 0, 8, 16, 40, 48, 56, 64, 88, 96, 104, 112, 120, 144, 152, 160, 168, 192, 200, 208, 216, 224, 248, 256, 264, 272, 304, 312, 320, 328, 360, 368, 376, 416, 424, 432, 472, 480, 528, 536, 584, 640,
+                81, -848, -775, -709, -702, -636, -629, -570, -563, -556, -497, -490, -483, -431, -424, -417, -410, -358, -351, -344, -337, -292, -285, -278, -271, -264, -219, -212, -205, -198, -153, -146, -139, -132, -125, -80, -73, -66, -59, -14, -7, 0, 7, 14, 59, 66, 73, 80, 125, 132, 139, 146, 153, 198, 205, 212, 219, 264, 271, 278, 285, 292, 337, 344, 351, 358, 410, 417, 424, 431, 483, 490, 497, 556, 563, 570, 629, 636, 702, 709, 775, 848,
+                81, -1464, -1328, -1234, -1192, -1098, -1056, -1004, -962, -920, -868, -826, -784, -774, -732, -690, -648, -638, -596, -554, -544, -512, -502, -460, -418, -408, -376, -366, -324, -314, -282, -272, -230, -188, -178, -146, -136, -94, -84, -52, -42, 0, 42, 52, 84, 94, 136, 146, 178, 188, 230, 272, 282, 314, 324, 366, 376, 408, 418, 460, 502, 512, 544, 554, 596, 638, 648, 690, 732, 774, 784, 826, 868, 920, 962, 1004, 1056, 1098, 1192, 1234, 1328, 1464
+            };
+
+            const unsigned int g_maxPotentialOffsets = 81; // Equals the largest leading count stored in g_potentialOffsets4.
+
+            const int16_t g_thModifierTable[8] = // Values match the ETC2 T/H-mode distance table; presumably indexed by the 3-bit distance index - TODO confirm against the encoder.
+            {
+                3, 6, 11, 16, 23, 32, 41, 64
+            };
+        }
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC2.h b/thirdparty/cvtt/ConvectionKernels_ETC2.h
new file mode 100644
index 0000000000..4befc8e8c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC2.h
@@ -0,0 +1,35 @@
+#include <stdint.h>
+
+namespace cvtt
+{
+    namespace Tables
+    {
+        namespace ETC2
+        {
+            const int16_t g_thModifierTable[8] = // Same eight values as Tables::ETC1::g_thModifierTable (ETC2 T/H-mode distance table).
+            {
+                3, 6, 11, 16, 23, 32, 41, 64
+            };
+
+            const int16_t g_alphaModifierTablePositive[16][4] = // 16 rows of 4 non-negative modifiers. NOTE(review): values match the positive half of the EAC alpha modifier table, where the negative entries are -(x + 1) - confirm how the encoder derives them.
+            {
+                { 2, 5, 8, 14, },
+                { 2, 6, 9, 12, },
+                { 1, 4, 7, 12, },
+                { 1, 3, 5, 12, },
+                { 2, 5, 7, 11, },
+                { 2, 6, 8, 10, },
+                { 3, 6, 7, 10, },
+                { 2, 4, 7, 10, },
+                { 1, 5, 7, 9, },
+                { 1, 4, 7, 9, },
+                { 1, 3, 7, 9, },
+                { 1, 4, 6, 9, },
+                { 2, 3, 6, 9, },
+                { 0, 1, 2, 9, },
+                { 3, 5, 7, 8, },
+                { 2, 4, 6, 8, },
+            };
+        }
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h b/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h
new file mode 100644
index 0000000000..a4f5a3ddfa
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h
@@ -0,0 +1,27 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace ETC2 {
+    const int g_alphaRoundingTableWidth = 13;
+    const uint8_t g_alphaRoundingTables[16][13] =
+    {
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3 },
+        { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3 },
+    };
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h b/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h
new file mode 100644
index 0000000000..c1276553b2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h
@@ -0,0 +1,181 @@
+#pragma once
+#ifndef __CVTT_ENDPOINTREFINER_H__
+#define __CVTT_ENDPOINTREFINER_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        // Least-squares endpoint refinement: per channel, solve for a, b where v = a*t + b
+        // The refined endpoints are the fitted values at T=0 (b) and T=1 (a + b)
+        // Least squares from the accumulated totals:
+        // a = (tv - t*v/w)/(tt - t*t/w)
+        // b = (v - a*t)/w
+        template<int TVectorSize>
+        class EndpointRefiner
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+
+            MFloat m_tv[TVectorSize]; // per-channel running sum of t * v (v as computed in the Contribute* methods)
+            MFloat m_v[TVectorSize]; // per-channel running sum of v
+            MFloat m_tt; // running sum of weight * t * t (t * t for unweighted samples)
+            MFloat m_t; // running sum of weight * t (t for unweighted samples)
+            MFloat m_w; // running sum of the weights passed to ContributePW
+            int m_wu; // number of ContributeUnweightedPW samples; added to m_w in GetRefinedEndpoints
+
+            float m_rcpMaxIndex; // 1 / (indexRange - 1), set in Init
+            float m_channelWeights[TVectorSize]; // copy of the weights passed to Init
+            float m_rcpChannelWeights[TVectorSize]; // 1 / weight per channel, or 1.0 where the weight is 0
+            // Resets every accumulator and caches the index scale and the per-channel (reciprocal) weights.
+            void Init(int indexRange, const float channelWeights[TVectorSize])
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_tv[ch] = ParallelMath::MakeFloatZero();
+                    m_v[ch] = ParallelMath::MakeFloatZero();
+                }
+                m_tt = ParallelMath::MakeFloatZero();
+                m_t = ParallelMath::MakeFloatZero();
+                m_w = ParallelMath::MakeFloatZero();
+
+                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1); // NOTE(review): divides by zero when indexRange == 1; not checked here
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_channelWeights[ch] = channelWeights[ch];
+                    m_rcpChannelWeights[ch] = 1.0f; // fallback used when the weight is exactly 0
+                    if (m_channelWeights[ch] != 0.0f)
+                        m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
+                }
+
+                m_wu = 0;
+            }
+            // Adds one weighted sample: t = index * m_rcpMaxIndex, v = pwFloatPixel[ch] * weight (pwFloatPixel is presumably pre-weighted - confirm with callers).
+            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
+            {
+                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat v = pwFloatPixel[ch] * weight;
+
+                    m_tv[ch] = m_tv[ch] + t * v;
+                    m_v[ch] = m_v[ch] + v;
+                }
+                m_tt = m_tt + weight * t * t;
+                m_t = m_t + weight * t;
+                m_w = m_w + weight;
+            }
+            // Adds one sample with implicit weight 1; only the first numRealChannels channels are accumulated.
+            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
+            {
+                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MFloat v = pwFloatPixel[ch];
+
+                    m_tv[ch] = m_tv[ch] + t * v;
+                    m_v[ch] = m_v[ch] + v;
+                }
+                m_tt = m_tt + t * t;
+                m_t = m_t + t;
+                m_wu++; // counted as an integer; converted to float and merged with m_w when solving
+            }
+            // Convenience overload: accumulates all TVectorSize channels.
+            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
+            {
+                ContributeUnweightedPW(floatPixel, index, TVectorSize);
+            }
+            // Solves the least-squares fit from the totals; endPoint[0] is the value at t=0, endPoint[1] the value at t=1, both divided by the channel weight.
+            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
+            {
+                // a = (tv - t*v/w)/(tt - t*t/w)
+                // b = (v - a*t)/w
+                MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu)); // total weight: weighted sum plus 1 per unweighted sample
+
+                ParallelMath::MakeSafeDenominator(w); // defined in ParallelMath; presumably replaces a zero w before it is inverted - confirm
+                MFloat wRcp = ParallelMath::Reciprocal(w);
+
+                MFloat adenom = (m_tt * w - m_t * m_t) * wRcp; // algebraically tt - t*t/w
+
+                ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
+                ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f)); // keep the division below defined; flagged results are overwritten afterwards
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    /*
+                    if (adenom == 0.0)
+                    p1 = p2 = er.v / er.w;
+                    else
+                    {
+                    float4 a = (er.tv - er.t*er.v / er.w) / adenom;
+                    float4 b = (er.v - a * er.t) / er.w;
+                    p1 = b;
+                    p2 = a + b;
+                    }
+                    */
+
+                    MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
+                    MFloat b = (m_v[ch] - a * m_t) * wRcp;
+
+                    MFloat p1 = b;
+                    MFloat p2 = a + b;
+
+                    ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp)); // degenerate case: both endpoints become v / w
+                    ParallelMath::ConditionalSet(p2, adenomZero, p1);
+
+                    // Unweight
+                    float inverseWeight = m_rcpChannelWeights[ch]; // 1 / channel weight, or 1.0 when the weight is 0 (see Init)
+
+                    endPoint[0][ch] = p1 * inverseWeight;
+                    endPoint[1][ch] = p2 * inverseWeight;
+                }
+            }
+            // Refined endpoints clamped to [0, 255], then rounded and converted to MUInt15.
+            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                MFloat floatEndPoint[2][TVectorSize];
+                GetRefinedEndpoints(floatEndPoint);
+
+                for (int epi = 0; epi < 2; epi++)
+                    for (int ch = 0; ch < TVectorSize; ch++) // NOTE(review): numRealChannels is not used; all TVectorSize channels are converted
+                        endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
+            }
+            // Convenience overload: forwards TVectorSize as numRealChannels.
+            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
+            }
+            // Refined endpoints as MSInt16: clamped to [-31743, 31743] when isSigned, otherwise to [0, 31743].
+            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                MFloat floatEndPoint[2][TVectorSize];
+                GetRefinedEndpoints(floatEndPoint);
+
+                for (int epi = 0; epi < 2; epi++)
+                {
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                    {
+                        MFloat f = floatEndPoint[epi][ch];
+                        if (isSigned)
+                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
+                        else
+                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
+                    }
+                }
+            }
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h b/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h
new file mode 100644
index 0000000000..e09dfd248c
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h
@@ -0,0 +1,153 @@
+#pragma once
+#ifndef __CVTT_ENDPOINTSELECTOR_H__
+#define __CVTT_ENDPOINTSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+#include "ConvectionKernels_PackedCovarianceMatrix.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        static const int NumEndpointSelectorPasses = 3; // ContributePass() handles passes 0, 1 and 2
+        // Picks initial endpoints in three passes: weighted centroid, dominant direction (power iteration on the covariance), then min/max extent along it.
+        template<int TVectorSize, int TIterationCount>
+        class EndpointSelector
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            // Zeroes the accumulators; min/max start at +/-FLT_MAX so the first ContributeMinMax() sample replaces both.
+            EndpointSelector()
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_centroid[ch] = ParallelMath::MakeFloatZero();
+                    m_direction[ch] = ParallelMath::MakeFloatZero();
+                }
+                m_weightTotal = ParallelMath::MakeFloatZero();
+                m_minDist = ParallelMath::MakeFloat(FLT_MAX);
+                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
+            }
+            // Feeds one sample to the given pass (0 = centroid, 1 = direction, 2 = min/max); other pass values are ignored.
+            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
+            {
+                if (pass == 0)
+                    ContributeCentroid(value, weight);
+                else if (pass == 1)
+                    ContributeDirection(value, weight);
+                else if (pass == 2)
+                    ContributeMinMax(value); // weight is not used by the min/max pass
+            }
+            // Finalizes pass 0 (centroid) or pass 1 (direction); pass 2 needs no finishing step.
+            void FinishPass(int pass)
+            {
+                if (pass == 0)
+                    FinishCentroid();
+                else if (pass == 1)
+                    FinishDirection();
+            }
+            // Returns base = min point and offset = (max - min) along the direction, each divided by the channel weight (a zero weight is treated as 1).
+            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
+            {
+                MFloat unweightedBase[TVectorSize];
+                MFloat unweightedOffset[TVectorSize];
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
+                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
+
+                    float safeWeight = channelWeights[ch];
+                    if (safeWeight == 0.f)
+                        safeWeight = 1.0f;
+                    // Divide by safeWeight, not channelWeights[ch], so a zero weight cannot divide by zero (same fallback as EndpointRefiner::Init).
+                    unweightedBase[ch] = min / safeWeight;
+                    unweightedOffset[ch] = (max - min) / safeWeight;
+                }
+
+                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
+            }
+
+        private:
+            void ContributeCentroid(const MFloat *value, const MFloat &weight) // accumulates value * weight and the total weight
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
+                m_weightTotal = m_weightTotal + weight;
+            }
+
+            void FinishCentroid() // turns the weighted sum into a weighted mean
+            {
+                MFloat denom = m_weightTotal;
+                ParallelMath::MakeSafeDenominator(denom); // defined in ParallelMath; presumably guards against a zero total weight - confirm
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_centroid[ch] = m_centroid[ch] / denom;
+            }
+
+            void ContributeDirection(const MFloat *value, const MFloat &weight) // adds the centroid-relative sample to the covariance matrix
+            {
+                MFloat diff[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    diff[ch] = value[ch] - m_centroid[ch];
+
+                m_covarianceMatrix.Add(diff, weight);
+            }
+
+            void FinishDirection() // TIterationCount rounds of multiply-by-covariance and rescale, then normalize to unit length
+            {
+                MFloat approx[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    approx[ch] = ParallelMath::MakeFloat(1.0f); // initial guess: all-ones vector
+
+                for (int i = 0; i < TIterationCount; i++)
+                {
+                    MFloat product[TVectorSize];
+                    m_covarianceMatrix.Product(product, approx);
+
+                    MFloat largestComponent = product[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
+
+                    // product = largestComponent*newApprox
+                    ParallelMath::MakeSafeDenominator(largestComponent);
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        approx[ch] = product[ch] / largestComponent;
+                }
+
+                // Normalize
+                MFloat approxLen = ParallelMath::MakeFloatZero();
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    approxLen = approxLen + approx[ch] * approx[ch];
+
+                approxLen = ParallelMath::Sqrt(approxLen);
+
+                ParallelMath::MakeSafeDenominator(approxLen);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_direction[ch] = approx[ch] / approxLen;
+            }
+
+            void ContributeMinMax(const MFloat *value) // tracks the smallest and largest projection onto m_direction
+            {
+                MFloat dist = ParallelMath::MakeFloatZero();
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]); // dot product of the direction with the centroid-relative sample
+
+                m_minDist = ParallelMath::Min(m_minDist, dist);
+                m_maxDist = ParallelMath::Max(m_maxDist, dist);
+            }
+
+            ParallelMath::Float m_centroid[TVectorSize]; // weighted sum during pass 0, weighted mean after FinishCentroid()
+            ParallelMath::Float m_direction[TVectorSize]; // normalized direction written by FinishDirection()
+            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix; // filled during pass 1
+            ParallelMath::Float m_weightTotal; // sum of the weights seen in pass 0
+
+            ParallelMath::Float m_minDist; // smallest projection seen in pass 2
+            ParallelMath::Float m_maxDist; // largest projection seen in pass 2
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h b/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h
new file mode 100644
index 0000000000..1eb924befe
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h
@@ -0,0 +1,282 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace FakeBT709 {
+    const uint8_t g_rounding16[] =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+    };
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp b/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp
new file mode 100644
index 0000000000..b3d1b5497e
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp
@@ -0,0 +1,66 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_IndexSelector.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        const ParallelMath::UInt16 g_weightReciprocals[17] = // Indexed by range (number of index values); each entry is round(32768 / (range - 1)).
+        {
+            ParallelMath::MakeUInt16(0),        // -1 (range 0: no valid divisor, left as 0)
+            ParallelMath::MakeUInt16(0),        // 0  (range 1: divisor would be 0, left as 0)
+            ParallelMath::MakeUInt16(32768),    // 1
+            ParallelMath::MakeUInt16(16384),    // 2
+            ParallelMath::MakeUInt16(10923),    // 3
+            ParallelMath::MakeUInt16(8192),     // 4
+            ParallelMath::MakeUInt16(6554),     // 5
+            ParallelMath::MakeUInt16(5461),     // 6
+            ParallelMath::MakeUInt16(4681),     // 7
+            ParallelMath::MakeUInt16(4096),     // 8
+            ParallelMath::MakeUInt16(3641),     // 9
+            ParallelMath::MakeUInt16(3277),     // 10
+            ParallelMath::MakeUInt16(2979),     // 11
+            ParallelMath::MakeUInt16(2731),     // 12
+            ParallelMath::MakeUInt16(2521),     // 13
+            ParallelMath::MakeUInt16(2341),     // 14
+            ParallelMath::MakeUInt16(2185),     // 15
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.h b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
new file mode 100644
index 0000000000..0f9d209183
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
@@ -0,0 +1,147 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTOR_H__
+#define __CVTT_INDEXSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        extern const ParallelMath::UInt16 g_weightReciprocals[17];
+
+        template<int TVectorSize>
+        class IndexSelector // Selects interpolation indexes for TVectorSize-channel pixels along the segment between two endpoints, and reconstructs pixels from indexes.
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+            // Stores the interpolation endpoints and derives m_origin / m_axis from the color-space endpoints; range is the number of index values.
+            template<class TInterpolationEPType, class TColorEPType>
+            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
+            {
+                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
+                // We need to select indexes using the color-space endpoints.
+
+                m_isUniform = true; // stays true only if every channel weight equals channelWeights[0]
+                for (int ch = 1; ch < TVectorSize; ch++)
+                {
+                    if (channelWeights[ch] != channelWeights[0])
+                        m_isUniform = false;
+                }
+
+                // To work with channel weights, we need something where:
+                // pxDiff = px - ep[0]
+                // epDiff = ep[1] - ep[0]
+                //
+                // weightedEPDiff = epDiff * channelWeights
+                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
+                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
+                // index = normalizedIndex * maxValue
+                //
+                // Equivalent to:
+                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
+                // index = dot(axis, pxDiff)
+
+                for (int ep = 0; ep < 2; ep++)
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
+
+                m_range = range; // later used to index g_weightReciprocals, which has 17 entries
+                m_maxValue = static_cast<float>(range - 1); // largest selectable index
+
+                MFloat epDiffWeighted[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
+                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
+                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
+                }
+
+                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
+
+                ParallelMath::MakeSafeDenominator(lenSquared); // NOTE(review): presumably guards the division below against a zero length (identical endpoints) - confirm
+
+                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
+            }
+
+            // Convenience overload for MUInt15 endpoints; TSigned is not referenced by the body.
+            template<bool TSigned>
+            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
+            {
+                // The same endpoint array serves as both the interpolation endpoints and
+                // the color-space endpoints.  The two-array Init() performs the MAInt16
+                // conversion itself, so the locally converted copy that used to be built
+                // here (and was never read) has been dropped.
+
+                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
+            }
+
+            // Reconstructs numRealChannels channels for an index with weights out of 64: ((64 - w) * ep0 + w * ep1 + 32) >> 6.
+            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
+                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
+                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
+                }
+            }
+
+            // Same blend as ReconstructLDR_BC7 but with weights out of 256: ((256 - w) * ep0 + w * ep1 + 128) >> 8.
+            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
+                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
+                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
+                }
+            }
+
+            // Reconstructs all TVectorSize channels with the 64-step weights.
+            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
+            {
+                ReconstructLDR_BC7(index, pixel, TVectorSize);
+            }
+
+            // Reconstructs all TVectorSize channels with the 256-step weights.
+            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
+            {
+                ReconstructLDRPrecise(index, pixel, TVectorSize);
+            }
+
+            // Projects (pixel - m_origin) onto m_axis, clamps the result to [0, m_maxValue] and rounds it to an index.
+            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
+            {
+                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
+
+                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
+            }
+
+        protected:
+            MAInt16 m_endPoint[2][TVectorSize]; // interpolation endpoints, stored without signedness
+
+        private:
+            MFloat m_origin[TVectorSize]; // color-space endpoint 0 as float
+            MFloat m_axis[TVectorSize];   // per-channel projection axis computed in Init()
+            int m_range;                  // number of index values
+            float m_maxValue;             // range - 1
+            bool m_isUniform;             // set in Init(); not read anywhere in this class
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h b/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h
new file mode 100644
index 0000000000..84795cd689
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h
@@ -0,0 +1,155 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTORHDR_H__
+#define __CVTT_INDEXSELECTORHDR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_IndexSelector.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v);
+        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v);
+
+        template<int TVectorSize>
+        class IndexSelectorHDR : public IndexSelector<TVectorSize> // Adds HDR reconstruction, exhaustive index search and optional index inversion on top of IndexSelector.
+        {
+        public:
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt31 MUInt31;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::Float MFloat;
+
+        private:
+            // Returns (m_maxValueMinusOne - anIndex) in lanes where m_isInverted is set, and anIndex unchanged elsewhere.
+            MUInt15 InvertSingle(const MUInt15& anIndex) const
+            {
+                MUInt15 inverted = m_maxValueMinusOne - anIndex;
+                return ParallelMath::Select(m_isInverted, inverted, anIndex);
+            }
+
+            // Blends the signed endpoints with a weight out of 64 in 32-bit precision, then passes the result through UnscaleHDRValueSigned.
+            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
+                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
+
+                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
+
+                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6); // add half (32) before dividing by 64
+
+                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
+                }
+            }
+
+            // Unsigned counterpart of ReconstructHDRSignedUninverted; the output is still written as MSInt16.
+            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
+                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
+
+                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
+
+                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6); // add half (32) before dividing by 64
+
+                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
+                }
+            }
+
+            // Squared difference between pixel[ch] and the precomputed reconstruction for the given index and channel.
+            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const
+            {
+                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
+                return diff * diff;
+            }
+
+            // Sum of the per-channel squared differences for the given index.
+            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const
+            {
+                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);
+                return error;
+            }
+
+        public:
+            // Records range, clears inversion and, unless fastIndexing is set, precomputes the weighted reconstruction for every index.
+            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)
+            {
+                assert(range <= 16); // m_reconstructedInterpolators has 16 rows
+
+                m_range = range; // this class's own m_range; the base class keeps a separate private member of the same name
+
+                m_isInverted = ParallelMath::MakeBoolInt16(false);
+                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));
+
+                if (!fastIndexing) // with fastIndexing the table below is left unfilled
+                {
+                    for (int i = 0; i < range; i++)
+                    {
+                        MSInt16 recon2CL[TVectorSize];
+
+                        if (isSigned)
+                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
+                        else
+                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
+
+                        for (int ch = 0; ch < TVectorSize; ch++)
+                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];
+                    }
+                }
+            }
+
+            // Signed reconstruction that first maps the index through InvertSingle.
+            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const
+            {
+                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
+            }
+
+            // Unsigned reconstruction that first maps the index through InvertSingle.
+            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const
+            {
+                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
+            }
+
+            // Replaces the per-lane inversion flags consulted by InvertSingle.
+            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)
+            {
+                m_isInverted = invert;
+            }
+
+            // Tries every index in [0, m_range) against the precomputed table and keeps the one with the lowest error; reads the table filled by InitHDR when fastIndexing is false.
+            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const
+            {
+                MUInt15 index = ParallelMath::MakeUInt15(0);
+
+                MFloat bestError = ErrorForInterpolator(0, pixel);
+                for (int i = 1; i < m_range; i++)
+                {
+                    MFloat error = ErrorForInterpolator(i, pixel);
+                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
+                    bestError = ParallelMath::Min(bestError, error);
+                }
+
+                return InvertSingle(index);
+            }
+
+            // Uses the base class projection (SelectIndexLDR) and then applies InvertSingle.
+            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
+            {
+                return InvertSingle(this->SelectIndexLDR(pixel, rtn));
+            }
+
+        private:
+            MFloat m_reconstructedInterpolators[16][TVectorSize]; // per-index reconstructed values scaled by channelWeights
+            ParallelMath::Int16CompFlag m_isInverted;             // lanes whose indexes are reversed
+            MUInt15 m_maxValueMinusOne;                           // set to range - 1 in InitHDR
+            int m_range;                                          // number of index values
+        };
+    }
+}
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h b/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h
new file mode 100644
index 0000000000..7ac3d4fdda
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h
@@ -0,0 +1,68 @@
+#pragma once
+#ifndef __CVTT_COVARIANCEMATRIX_H__
+#define __CVTT_COVARIANCEMATRIX_H__
+
+namespace cvtt
+{
+    namespace Internal
+    {
+
+        template<int TMatrixSize>
+        class PackedCovarianceMatrix // Symmetric TMatrixSize x TMatrixSize accumulator that stores only the lower triangle, row by row.
+        {
+        public:
+            // 0: xx,
+            // 1: xy, yy
+            // 3: xz, yz, zz
+            // 6: xw, yw, zw, ww
+            // ... etc.
+            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
+
+            typedef ParallelMath::Float MFloat;
+            // Starts with every packed entry set to zero.
+            PackedCovarianceMatrix()
+            {
+                for (int i = 0; i < PyramidSize; i++)
+                    m_values[i] = ParallelMath::MakeFloatZero();
+            }
+            // Accumulates weight * vec[row] * vec[col] into every stored (row, col) entry with col <= row.
+            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)
+            {
+                int index = 0;
+                for (int row = 0; row < TMatrixSize; row++)
+                {
+                    for (int col = 0; col <= row; col++)
+                    {
+                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
+                        index++;
+                    }
+                }
+            }
+            // Writes the matrix-vector product of the full symmetric matrix with inVec into outVec.
+            void Product(MFloat *outVec, const MFloat *inVec)
+            {
+                for (int row = 0; row < TMatrixSize; row++)
+                {
+                    MFloat sum = ParallelMath::MakeFloatZero();
+
+                    int index = (row * (row + 1)) >> 1; // packed offset of element (row, 0)
+                    for (int col = 0; col < TMatrixSize; col++)
+                    {
+                        sum = sum + inVec[col] * m_values[index];
+                        if (col >= row)
+                            index += col + 1; // past the diagonal: step down the column, i.e. from stored (col, row) to (col + 1, row)
+                        else
+                            index++; // before the diagonal: step along the stored row
+                    }
+
+                    outVec[row] = sum;
+                }
+            }
+
+        private:
+            ParallelMath::Float m_values[PyramidSize]; // packed lower triangle
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ParallelMath.h b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
new file mode 100644
index 0000000000..9e25280f45
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
@@ -0,0 +1,1816 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+#pragma once
+#ifndef __CVTT_PARALLELMATH_H__
+#define __CVTT_PARALLELMATH_H__
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_Config.h"
+
+#ifdef CVTT_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#include <float.h>
+#include <assert.h>
+#include <string.h>
+#include <algorithm>
+#include <math.h>
+
+#define UNREFERENCED_PARAMETER(n) ((void)n)
+
+// Parallel math implementation
+//
+// After preprocessor defs are handled, what this should do is expose the following types:
+// SInt16 - Signed 16-bit integer
+// UInt16 - Unsigned 16-bit integer
+// UInt15 - Unsigned 15-bit integer
+// SInt32 - Signed 32-bit integer
+// UInt31 - Unsigned 31-bit integer
+// AInt16 - 16-bit integer of unknown signedness (only used for storage)
+// Int16CompFlag - Comparison flags from comparing 16-bit integers
+// Int32CompFlag - Comparison flags from comparing 32-bit integers
+// FloatCompFlag - Comparison flags from comparing 32-bit floats
+//
+// The reason for these distinctions is that, depending on the instruction set, signed or unsigned versions of certain ops
+// (particularly max, min, compares, and right shift) may not be available.  In cases where ops are not available, it's
+// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers.  The 15-bit and 31-bit uint types
+// can elide the bit flips if unsigned versions are not available.
+
+namespace cvtt
+{
+#ifdef CVTT_USE_SSE2
+    // SSE2 version
+    struct ParallelMath
+    {
+        typedef uint16_t ScalarUInt16;
+        typedef int16_t ScalarSInt16;
+
+        template<unsigned int TRoundingMode>
+        struct RoundForScope // Switches the MXCSR rounding-mode bits to TRoundingMode for the lifetime of the object.
+        {
+            unsigned int m_oldCSR; // MXCSR value captured by the constructor
+
+            RoundForScope()
+            {
+                m_oldCSR = _mm_getcsr();
+                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode)); // replace only the rounding-control bits
+            }
+
+            ~RoundForScope()
+            {
+                _mm_setcsr(m_oldCSR); // restore the saved control/status word
+            }
+        };
+
+        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO> // scoped _MM_ROUND_TOWARD_ZERO
+        {
+        };
+
+        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST> // scoped _MM_ROUND_NEAREST
+        {
+        };
+
+        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP> // scoped _MM_ROUND_UP
+        {
+        };
+
+        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN> // scoped _MM_ROUND_DOWN
+        {
+        };
+
+        static const int ParallelSize = 8; // NOTE(review): presumably the number of lanes processed per vector value - confirm
+
+        enum Int16Subtype // Tag values passed as the TSubtype argument of VInt16 / VInt32.
+        {
+            IntSubtype_Signed, // used by the SInt16 / SInt32 typedefs
+            IntSubtype_UnsignedFull, // used by the UInt16 / UInt32 typedefs
+            IntSubtype_UnsignedTruncated, // used by the UInt15 / UInt31 typedefs
+            IntSubtype_Abstract, // used by the AInt16 / AInt32 typedefs
+        };
+
+        template<int TSubtype>
+        struct VInt16 // One __m128i treated as packed 16-bit integers; TSubtype only tags the intended signedness.
+        {
+            __m128i m_value;
+
+            inline VInt16 operator+(int16_t other) const // adds the same scalar to every 16-bit lane
+            {
+                VInt16 result;
+                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
+                return result;
+            }
+
+            inline VInt16 operator+(const VInt16 &other) const // lane-wise 16-bit add
+            {
+                VInt16 result;
+                result.m_value = _mm_add_epi16(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator|(const VInt16 &other) const // bitwise OR of the whole register
+            {
+                VInt16 result;
+                result.m_value = _mm_or_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator&(const VInt16 &other) const // bitwise AND of the whole register
+            {
+                VInt16 result;
+                result.m_value = _mm_and_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator-(const VInt16 &other) const // lane-wise 16-bit subtract
+            {
+                VInt16 result;
+                result.m_value = _mm_sub_epi16(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator<<(int bits) const // shifts every 16-bit lane left by the same count
+            {
+                VInt16 result;
+                result.m_value = _mm_slli_epi16(m_value, bits);
+                return result;
+            }
+
+            inline VInt16 operator^(const VInt16 &other) const // bitwise XOR of the whole register
+            {
+                VInt16 result;
+                result.m_value = _mm_xor_si128(m_value, other.m_value);
+                return result;
+            }
+        };
+
+        typedef VInt16<IntSubtype_Signed> SInt16;
+        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
+        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
+        typedef VInt16<IntSubtype_Abstract> AInt16;
+
+        template<int TSubtype>
+        struct VInt32 // Two __m128i registers treated as packed 32-bit integers; TSubtype only tags the intended signedness.
+        {
+            __m128i m_values[2];
+
+            inline VInt32 operator+(const VInt32& other) const // lane-wise 32-bit add on both halves
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline VInt32 operator-(const VInt32& other) const // lane-wise 32-bit subtract on both halves
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline VInt32 operator<<(const int other) const // shifts every 32-bit lane left by the same count
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
+                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
+                return result;
+            }
+
+            inline VInt32 operator|(const VInt32& other) const // bitwise OR on both halves
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        typedef VInt32<IntSubtype_Signed> SInt32;
+        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
+        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
+        typedef VInt32<IntSubtype_Abstract> AInt32;
+
+        template<class TTargetType>
+        struct LosslessCast // Re-tags a VInt16 / VInt32 as TTargetType without changing any bits.
+        {
+#ifdef CVTT_PERMIT_ALIASING
+            template<int TSrcSubtype>
+            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src) // returns a view of src itself; it must not outlive src
+            {
+                return reinterpret_cast<const TTargetType&>(src); // was VInt32<TSubtype>&: TSubtype is undeclared here and the cast dropped const
+            }
+
+            template<int TSrcSubtype>
+            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src) // returns a view of src itself; it must not outlive src
+            {
+                return reinterpret_cast<const TTargetType&>(src); // was VInt16<TSubtype>&: TSubtype is undeclared here and the cast dropped const
+            }
+#else
+            template<int TSrcSubtype>
+            static TTargetType Cast(const VInt32<TSrcSubtype> &src) // copies both registers into a TTargetType value
+            {
+                TTargetType result;
+                result.m_values[0] = src.m_values[0];
+                result.m_values[1] = src.m_values[1];
+                return result;
+            }
+
+            template<int TSrcSubtype>
+            static TTargetType Cast(const VInt16<TSrcSubtype> &src) // copies the register into a TTargetType value
+            {
+                TTargetType result;
+                result.m_value = src.m_value;
+                return result;
+            }
+#endif
+        };
+
+        struct Int64 // Storage-only type: four __m128i registers, no operators defined here.
+        {
+            __m128i m_values[4];
+        };
+
+        struct Float // Two __m128 registers of packed single-precision floats with lane-wise arithmetic.
+        {
+            __m128 m_values[2];
+
+            inline Float operator+(const Float &other) const // lane-wise add
+            {
+                Float result;
+                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator+(float other) const // adds the same scalar to every lane
+            {
+                Float result;
+                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+
+            inline Float operator-(const Float& other) const // lane-wise subtract
+            {
+                Float result;
+                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator-() const // negation, computed as 0 - value
+            {
+                Float result;
+                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
+                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
+                return result;
+            }
+
+            inline Float operator*(const Float& other) const // lane-wise multiply
+            {
+                Float result;
+                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator*(float other) const // multiplies every lane by the same scalar
+            {
+                Float result;
+                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+
+            inline Float operator/(const Float &other) const // lane-wise divide
+            {
+                Float result;
+                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator/(float other) const // divides every lane by the same scalar
+            {
+                Float result;
+                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+        };
+
+        struct Int16CompFlag // Comparison mask for 16-bit lanes, held in one __m128i.
+        {
+            __m128i m_value;
+
+            inline Int16CompFlag operator&(const Int16CompFlag &other) const // bitwise AND of the two masks
+            {
+                Int16CompFlag result;
+                result.m_value = _mm_and_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline Int16CompFlag operator|(const Int16CompFlag &other) const // bitwise OR of the two masks
+            {
+                Int16CompFlag result;
+                result.m_value = _mm_or_si128(m_value, other.m_value);
+                return result;
+            }
+        };
+
+        struct Int32CompFlag // Comparison mask for 32-bit lanes, held in two __m128i registers.
+        {
+            __m128i m_values[2];
+
+            inline Int32CompFlag operator&(const Int32CompFlag &other) const // bitwise AND of the two masks
+            {
+                Int32CompFlag result;
+                result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Int32CompFlag operator|(const Int32CompFlag &other) const // bitwise OR of the two masks
+            {
+                Int32CompFlag result;
+                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        struct FloatCompFlag // Comparison mask for float lanes, held in two __m128 registers.
+        {
+            __m128 m_values[2];
+
+            inline FloatCompFlag operator&(const FloatCompFlag &other) const // bitwise AND of the two masks
+            {
+                FloatCompFlag result;
+                result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline FloatCompFlag operator|(const FloatCompFlag &other) const // bitwise OR of the two masks
+            {
+                FloatCompFlag result;
+                result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        template<int TSubtype>
+        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // lane-wise 16-bit add (_mm_add_epi16)
+        {
+            VInt16<TSubtype> sum;
+            sum.m_value = _mm_add_epi16(b.m_value, a.m_value);
+            return sum;
+        }
+
+        template<int TSubtype>
+        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // lane-wise 16-bit a - b
+        {
+            VInt16<TSubtype> difference;
+            difference.m_value = _mm_sub_epi16(a.m_value, b.m_value);
+            return difference;
+        }
+
+        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b) // bits of a where flag is set, bits of b elsewhere
+        {
+            Float picked;
+            picked.m_values[0] = _mm_or_ps(_mm_and_ps(flag.m_values[0], a.m_values[0]), _mm_andnot_ps(flag.m_values[0], b.m_values[0]));
+            picked.m_values[1] = _mm_or_ps(_mm_and_ps(flag.m_values[1], a.m_values[1]), _mm_andnot_ps(flag.m_values[1], b.m_values[1]));
+            return picked;
+        }
+
+        template<int TSubtype>
+        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // bits of a where flag is set, bits of b elsewhere
+        {
+            VInt16<TSubtype> picked;
+            picked.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, b.m_value), _mm_and_si128(flag.m_value, a.m_value));
+            return picked;
+        }
+
+        template<int TSubtype>
+        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a) // a masked by flag; cleared flag bits yield zero
+        {
+            VInt16<TSubtype> masked;
+            masked.m_value = _mm_and_si128(a.m_value, flag.m_value);
+            return masked;
+        }
+
+        template<int TSubtype>
+        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) // dest takes src where flag is set
+        {
+            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, src.m_value), _mm_andnot_si128(flag.m_value, dest.m_value));
+        }
+
+        template<int TSubtype>
+        static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src) // dest takes src where flag is set
+        {
+            __m128i maskLo = _mm_unpacklo_epi16(flag.m_value, flag.m_value); // each 16-bit flag lane duplicated to fill a 32-bit lane
+            __m128i maskHi = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
+            dest.m_values[0] = _mm_or_si128(_mm_and_si128(maskLo, src.m_values[0]), _mm_andnot_si128(maskLo, dest.m_values[0]));
+            dest.m_values[1] = _mm_or_si128(_mm_and_si128(maskHi, src.m_values[1]), _mm_andnot_si128(maskHi, dest.m_values[1]));
+        }
+
+        static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src) // dest takes src where flag is set
+        {
+            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, src.m_value), _mm_andnot_si128(flag.m_value, dest.m_value));
+        }
+
+        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v) // (v XOR flag) + (flag >> 15): two's-complement negate where flag is all-ones
+        {
+            SInt16 negated;
+            negated.m_value = _mm_add_epi16(_mm_srli_epi16(flag.m_value, 15), _mm_xor_si128(v.m_value, flag.m_value));
+            return negated;
+        }
+
+        template<int TSubtype>
+        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) // dest takes src where flag is NOT set
+        {
+            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, src.m_value), _mm_and_si128(flag.m_value, dest.m_value));
+        }
+
+        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) // dest takes src where flag is set
+        {
+            dest.m_values[0] = _mm_or_ps(_mm_andnot_ps(flag.m_values[0], dest.m_values[0]), _mm_and_ps(flag.m_values[0], src.m_values[0]));
+            dest.m_values[1] = _mm_or_ps(_mm_andnot_ps(flag.m_values[1], dest.m_values[1]), _mm_and_ps(flag.m_values[1], src.m_values[1]));
+        }
+
+        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) // dest takes src where flag is NOT set
+        {
+            dest.m_values[0] = _mm_or_ps(_mm_and_ps(flag.m_values[0], dest.m_values[0]), _mm_andnot_ps(flag.m_values[0], src.m_values[0]));
+            dest.m_values[1] = _mm_or_ps(_mm_and_ps(flag.m_values[1], dest.m_values[1]), _mm_andnot_ps(flag.m_values[1], src.m_values[1]));
+        }
+
+        static void MakeSafeDenominator(Float &v) // replaces lanes that compare equal to zero with 1.0f
+        {
+            ConditionalSet(v, Equal(MakeFloatZero(), v), MakeFloat(1.0f));
+        }
+
+        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision) // keeps the low `precision` bits and sign-extends them
+        {
+            const int discardedBits = 16 - precision;
+            if (discardedBits == 0)
+                return v;
+
+            SInt16 truncated;
+            truncated.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, discardedBits), discardedBits);
+            return truncated;
+        }
+
+        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision) // keeps the low `precision` bits and zero-extends them
+        {
+            const int discardedBits = 16 - precision;
+            if (discardedBits == 0)
+                return v;
+
+            UInt16 truncated;
+            truncated.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, discardedBits), discardedBits);
+            return truncated;
+        }
+
+        static UInt16 Min(const UInt16 &a, const UInt16 &b) // unsigned 16-bit min built on the signed min intrinsic
+        {
+            __m128i topBit = _mm_set1_epi16(-32768); // XOR with 0x8000 maps unsigned order onto signed order
+
+            UInt16 smallest;
+            smallest.m_value = _mm_xor_si128(topBit, _mm_min_epi16(_mm_xor_si128(a.m_value, topBit), _mm_xor_si128(b.m_value, topBit)));
+            return smallest;
+        }
+
+        static SInt16 Min(const SInt16 &a, const SInt16 &b) // lane-wise signed 16-bit minimum
+        {
+            SInt16 smallest;
+            smallest.m_value = _mm_min_epi16(b.m_value, a.m_value);
+            return smallest;
+        }
+
+        static UInt15 Min(const UInt15 &a, const UInt15 &b) // lane-wise minimum via the signed 16-bit intrinsic
+        {
+            UInt15 smallest;
+            smallest.m_value = _mm_min_epi16(b.m_value, a.m_value);
+            return smallest;
+        }
+
+        static Float Min(const Float &a, const Float &b) // lane-wise _mm_min_ps(a, b); operand order kept as-is
+        {
+            Float smallest;
+            smallest.m_values[0] = _mm_min_ps(a.m_values[0], b.m_values[0]);
+            smallest.m_values[1] = _mm_min_ps(a.m_values[1], b.m_values[1]);
+            return smallest;
+        }
+
+        static UInt16 Max(const UInt16 &a, const UInt16 &b) // unsigned 16-bit max built on the signed max intrinsic
+        {
+            __m128i topBit = _mm_set1_epi16(-32768); // XOR with 0x8000 maps unsigned order onto signed order
+
+            UInt16 largest;
+            largest.m_value = _mm_xor_si128(topBit, _mm_max_epi16(_mm_xor_si128(a.m_value, topBit), _mm_xor_si128(b.m_value, topBit)));
+            return largest;
+        }
+
+        static SInt16 Max(const SInt16 &a, const SInt16 &b) // lane-wise signed 16-bit maximum
+        {
+            SInt16 largest;
+            largest.m_value = _mm_max_epi16(b.m_value, a.m_value);
+            return largest;
+        }
+
+        static UInt15 Max(const UInt15 &a, const UInt15 &b) // lane-wise maximum via the signed 16-bit intrinsic
+        {
+            UInt15 largest;
+            largest.m_value = _mm_max_epi16(b.m_value, a.m_value);
+            return largest;
+        }
+
+        static Float Max(const Float &a, const Float &b) // lane-wise _mm_max_ps(a, b); operand order kept as-is
+        {
+            Float largest;
+            largest.m_values[0] = _mm_max_ps(a.m_values[0], b.m_values[0]);
+            largest.m_values[1] = _mm_max_ps(a.m_values[1], b.m_values[1]);
+            return largest;
+        }
+
+        static Float Clamp(const Float &v, float min, float max) // applies the upper bound first, then the lower bound
+        {
+            Float clamped;
+            clamped.m_values[0] = _mm_max_ps(_mm_min_ps(v.m_values[0], _mm_set1_ps(max)), _mm_set1_ps(min));
+            clamped.m_values[1] = _mm_max_ps(_mm_min_ps(v.m_values[1], _mm_set1_ps(max)), _mm_set1_ps(min));
+            return clamped;
+        }
+
+        static Float Reciprocal(const Float &v) // lane-wise _mm_rcp_ps, which is an approximate reciprocal
+        {
+            Float inverse;
+            inverse.m_values[0] = _mm_rcp_ps(v.m_values[0]);
+            inverse.m_values[1] = _mm_rcp_ps(v.m_values[1]);
+            return inverse;
+        }
+
+        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut) // gathers one channel of one pixel from 8 blocks, one block per lane
+        {
+            int16_t lanes[8];
+            for (int block = 0; block < 8; block++)
+                lanes[block] = inputBlocks[block].m_pixels[pxOffset][channel];
+
+            chOut.m_value = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+        }
+
+        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut) // gathers one channel of one pixel from 8 blocks, one block per lane
+        {
+            int16_t lanes[8];
+            for (int block = 0; block < 8; block++)
+                lanes[block] = inputBlocks[block].m_pixels[pxOffset][channel];
+
+            chOut.m_value = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+        }
+
+        static Float MakeFloat(float v) // broadcasts v into every lane of both halves
+        {
+            Float splat;
+            splat.m_values[1] = splat.m_values[0] = _mm_set1_ps(v);
+            return splat;
+        }
+
+        static Float MakeFloatZero() // all lanes set to +0.0f
+        {
+            Float zero;
+            zero.m_values[1] = zero.m_values[0] = _mm_setzero_ps();
+            return zero;
+        }
+
+        static UInt16 MakeUInt16(uint16_t v) // broadcasts v into all eight 16-bit lanes
+        {
+            UInt16 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static SInt16 MakeSInt16(int16_t v) // broadcasts v into all eight 16-bit lanes
+        {
+            SInt16 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static AInt16 MakeAInt16(int16_t v) // broadcasts v into all eight 16-bit lanes
+        {
+            AInt16 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static UInt15 MakeUInt15(uint16_t v) // broadcasts v into all eight 16-bit lanes
+        {
+            UInt15 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static SInt32 MakeSInt32(int32_t v) // broadcasts v into all 32-bit lanes of both halves
+        {
+            SInt32 splat;
+            for (int half = 0; half < 2; half++)
+                splat.m_values[half] = _mm_set1_epi32(v);
+            return splat;
+        }
+
+        static UInt31 MakeUInt31(uint32_t v) // broadcasts v into all 32-bit lanes of both halves
+        {
+            UInt31 splat;
+            for (int half = 0; half < 2; half++)
+                splat.m_values[half] = _mm_set1_epi32(static_cast<int>(v)); // same unsigned-to-int conversion the call performed implicitly
+            return splat;
+        }
+
+        static uint16_t Extract(const UInt16 &v, int offset) // reads 16-bit lane `offset` by viewing the register's storage as an array
+        {
+            return *(reinterpret_cast<const uint16_t*>(&v.m_value) + offset);
+        }
+
+        static int16_t Extract(const SInt16 &v, int offset) // reads 16-bit lane `offset` by viewing the register's storage as an array
+        {
+            return *(reinterpret_cast<const int16_t*>(&v.m_value) + offset);
+        }
+
+        static uint16_t Extract(const UInt15 &v, int offset) // reads 16-bit lane `offset` by viewing the register's storage as an array
+        {
+            return *(reinterpret_cast<const uint16_t*>(&v.m_value) + offset);
+        }
+
+        static int16_t Extract(const AInt16 &v, int offset) // reads 16-bit lane `offset` by viewing the register's storage as an array
+        {
+            return *(reinterpret_cast<const int16_t*>(&v.m_value) + offset);
+        }
+
+        static int32_t Extract(const SInt32 &v, int offset) // offset >> 2 picks the half, offset & 3 the 32-bit lane inside it
+        {
+            return *(reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2]) + (offset & 3));
+        }
+
+        static float Extract(const Float &v, int offset) // offset >> 2 picks the half, offset & 3 the float lane inside it
+        {
+            return *(reinterpret_cast<const float*>(&v.m_values[offset >> 2]) + (offset & 3));
+        }
+
+        static bool Extract(const ParallelMath::Int16CompFlag &v, int offset) // true when 16-bit lane `offset` of the mask is non-zero
+        {
+            return 0 != *(reinterpret_cast<const int16_t*>(&v.m_value) + offset);
+        }
+
+        static void PutUInt16(UInt16 &dest, int offset, uint16_t v) // writes v into 16-bit slot `offset` of dest's storage
+        {
+            *(reinterpret_cast<uint16_t*>(&dest) + offset) = v;
+        }
+
+        static void PutUInt15(UInt15 &dest, int offset, uint16_t v) // writes v into 16-bit slot `offset` of dest's storage
+        {
+            *(reinterpret_cast<uint16_t*>(&dest) + offset) = v;
+        }
+
+        static void PutSInt16(SInt16 &dest, int offset, int16_t v) // writes v into 16-bit slot `offset` of dest's storage
+        {
+            *(reinterpret_cast<int16_t*>(&dest) + offset) = v;
+        }
+
+        static float ExtractFloat(const Float &v, int offset) // reads float slot `offset` of v's storage viewed as a flat float array
+        {
+            return *(reinterpret_cast<const float*>(&v) + offset);
+        }
+
+        static void PutFloat(Float &dest, int offset, float v) // writes v into float slot `offset` of dest's storage
+        {
+            *(reinterpret_cast<float*>(&dest) + offset) = v;
+        }
+
+        static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v) // stores -1 (all bits set) for true, 0 for false, into slot `offset`
+        {
+            *(reinterpret_cast<int16_t*>(&dest) + offset) = v ? -1 : 0;
+        }
+
+        static Int32CompFlag Less(const UInt31 &a, const UInt31 &b) // signed 32-bit compare; presumably safe because UInt31 lanes keep bit 31 clear -- TODO confirm
+        {
+            Int32CompFlag isLess;
+            for (int half = 0; half < 2; half++)
+                isLess.m_values[half] = _mm_cmplt_epi32(a.m_values[half], b.m_values[half]);
+            return isLess;
+        }
+
+        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b) // per-lane signed a < b, expressed as b > a
+        {
+            Int16CompFlag isLess;
+            isLess.m_value = _mm_cmpgt_epi16(b.m_value, a.m_value);
+            return isLess;
+        }
+
+        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b) // per-lane a < b using the signed compare, expressed as b > a
+        {
+            Int16CompFlag isLess;
+            isLess.m_value = _mm_cmpgt_epi16(b.m_value, a.m_value);
+            return isLess;
+        }
+
+        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b) // per-lane a <= b; returns an all-ones lane where true
+        {
+            Int16CompFlag result;
+            result.m_value = _mm_or_si128(_mm_cmplt_epi16(a.m_value, b.m_value), _mm_cmpeq_epi16(a.m_value, b.m_value)); // strict-less alone would report false for equal lanes
+            return result;
+        }
+
+        static FloatCompFlag Less(const Float &a, const Float &b) // per-lane a < b
+        {
+            FloatCompFlag isLess;
+            isLess.m_values[0] = _mm_cmplt_ps(a.m_values[0], b.m_values[0]);
+            isLess.m_values[1] = _mm_cmplt_ps(a.m_values[1], b.m_values[1]);
+            return isLess;
+        }
+
+        static FloatCompFlag LessOrEqual(const Float &a, const Float &b) // per-lane a <= b
+        {
+            FloatCompFlag isLessOrEqual;
+            isLessOrEqual.m_values[0] = _mm_cmple_ps(a.m_values[0], b.m_values[0]);
+            isLessOrEqual.m_values[1] = _mm_cmple_ps(a.m_values[1], b.m_values[1]);
+            return isLessOrEqual;
+        }
+
+        template<int TSubtype>
+        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // per-lane 16-bit equality
+        {
+            Int16CompFlag isEqual;
+            isEqual.m_value = _mm_cmpeq_epi16(b.m_value, a.m_value);
+            return isEqual;
+        }
+
+        static FloatCompFlag Equal(const Float &a, const Float &b) // per-lane float equality (_mm_cmpeq_ps)
+        {
+            FloatCompFlag isEqual;
+            isEqual.m_values[0] = _mm_cmpeq_ps(a.m_values[0], b.m_values[0]);
+            isEqual.m_values[1] = _mm_cmpeq_ps(a.m_values[1], b.m_values[1]);
+            return isEqual;
+        }
+
+        static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b) // bitwise XNOR: set wherever the two masks agree
+        {
+            Int16CompFlag differs;
+            differs.m_value = _mm_xor_si128(b.m_value, a.m_value);
+            return Not(differs);
+        }
+
+        static Float ToFloat(const UInt16 &v) // zero-extends each 16-bit lane to 32 bits, then converts to float
+        {
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
+            converted.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
+            return converted;
+        }
+
+        static UInt31 ToUInt31(const UInt16 &v) // zero-extends each 16-bit lane to a 32-bit lane
+        {
+            UInt31 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
+            return widened;
+        }
+
+        static SInt32 ToInt32(const UInt16 &v) // zero-extends each 16-bit lane to a 32-bit lane
+        {
+            SInt32 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
+            return widened;
+        }
+
+        static SInt32 ToInt32(const UInt15 &v) // zero-extends each 16-bit lane to a 32-bit lane
+        {
+            SInt32 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
+            return widened;
+        }
+
+        static SInt32 ToInt32(const SInt16 &v) // sign-extends: place each lane in the upper 16 bits, then arithmetic-shift right by 16
+        {
+            SInt32 widened;
+            widened.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
+            widened.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
+            return widened;
+        }
+
+        static Float ToFloat(const SInt16 &v) // sign-extends each 16-bit lane to 32 bits, then converts to float
+        {
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
+            converted.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
+            return converted;
+        }
+
+        static Float ToFloat(const UInt15 &v) // zero-extends each 16-bit lane to 32 bits, then converts to float
+        {
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
+            converted.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
+            return converted;
+        }
+
+        static Float ToFloat(const UInt31 &v) // converts each 32-bit lane to float using the signed conversion intrinsic
+        {
+            Float converted;
+            for (int half = 0; half < 2; half++)
+                converted.m_values[half] = _mm_cvtepi32_ps(v.m_values[half]);
+            return converted;
+        }
+
+        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v) // narrows eight 32-bit mask lanes into eight 16-bit mask lanes
+        {
+            __m128i lowerBits = _mm_castps_si128(v.m_values[0]);
+            __m128i upperBits = _mm_castps_si128(v.m_values[1]);
+
+            Int16CompFlag narrowed;
+            narrowed.m_value = _mm_packs_epi32(lowerBits, upperBits); // signed saturation keeps -1 as -1 and 0 as 0
+            return narrowed;
+        }
+
+        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v) // widens each 16-bit mask lane to 32 bits by duplicating it
+        {
+            __m128i lowerBits = _mm_unpacklo_epi16(v.m_value, v.m_value);
+            __m128i upperBits = _mm_unpackhi_epi16(v.m_value, v.m_value);
+
+            FloatCompFlag widened;
+            widened.m_values[0] = _mm_castsi128_ps(lowerBits);
+            widened.m_values[1] = _mm_castsi128_ps(upperBits);
+            return widened;
+        }
+
+        static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v) // narrows eight 32-bit mask lanes into eight 16-bit mask lanes
+        {
+            __m128i lowerBits = v.m_values[0];
+            __m128i upperBits = v.m_values[1];
+
+            Int16CompFlag narrowed;
+            narrowed.m_value = _mm_packs_epi32(lowerBits, upperBits); // signed saturation keeps -1 as -1 and 0 as 0
+            return narrowed;
+        }
+
+        // Broadcasts a scalar bool into a 16-bit-lane mask:
+        // every bit set for true, every bit clear for false.
+        static Int16CompFlag MakeBoolInt16(bool b)
+        {
+            Int16CompFlag mask;
+            // A broadcast of 0 is bit-identical to _mm_setzero_si128().
+            mask.m_value = _mm_set1_epi16(b ? -1 : 0);
+            return mask;
+        }
+
+        // Broadcasts a scalar bool into a float-lane mask:
+        // every bit set for true, every bit clear for false.
+        static FloatCompFlag MakeBoolFloat(bool b)
+        {
+            FloatCompFlag mask;
+            // An all-zero integer pattern reinterpreted as float equals _mm_setzero_ps().
+            mask.m_values[0] = mask.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(b ? -1 : 0));
+            return mask;
+        }
+
+        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b) // a AND (NOT b); the intrinsic inverts its first operand
+        {
+            Int16CompFlag masked;
+            masked.m_value = _mm_andnot_si128(b.m_value, a.m_value);
+            return masked;
+        }
+
+        static Int16CompFlag Not(const Int16CompFlag &b) // flips every bit by XOR-ing with all-ones
+        {
+            Int16CompFlag inverted;
+            inverted.m_value = _mm_xor_si128(_mm_set1_epi32(-1), b.m_value);
+            return inverted;
+        }
+
+        static Int32CompFlag Not(const Int32CompFlag &b) // flips every bit of both halves by XOR-ing with all-ones
+        {
+            Int32CompFlag inverted;
+            for (int half = 0; half < 2; half++)
+                inverted.m_values[half] = _mm_xor_si128(b.m_values[half], _mm_set1_epi32(-1));
+            return inverted;
+        }
+
+        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/) // rounding-mode argument is unused here
+        {
+            __m128i biasedLo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768.0f))); // shift by -32768 so the signed saturating pack can be used
+            __m128i biasedHi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768.0f)));
+
+            __m128i biasedPacked = _mm_packs_epi32(biasedLo, biasedHi);
+
+            UInt16 rounded;
+            rounded.m_value = _mm_xor_si128(_mm_set1_epi16(-32768), biasedPacked); // flipping the top bit removes the bias again
+            return rounded;
+        }
+
+        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/) // float lanes -> 16-bit lanes; rounding-mode argument is unused here
+        {
+            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
+            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
+
+            __m128i packed = _mm_packs_epi32(lo, hi); // narrows to 16 bits with signed saturation
+
+            UInt15 result;
+            result.m_value = packed; // reuse the packed value instead of packing a second time
+            return result;
+        }
+
+        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/) // float lanes -> signed 16-bit lanes; rounding-mode argument is unused here
+        {
+            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
+            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
+
+            __m128i packed = _mm_packs_epi32(lo, hi); // narrows to 16 bits with signed saturation
+
+            SInt16 result;
+            result.m_value = packed; // reuse the packed value instead of packing a second time
+            return result;
+        }
+
+        static Float Sqrt(const Float &f) // lane-wise square root
+        {
+            Float root;
+            root.m_values[0] = _mm_sqrt_ps(f.m_values[0]);
+            root.m_values[1] = _mm_sqrt_ps(f.m_values[1]);
+            return root;
+        }
+
+        static UInt16 Abs(const SInt16 &a) // two's-complement absolute value: (a XOR signMask) + signBit
+        {
+            __m128i signMask = _mm_srai_epi16(a.m_value, 15); // all-ones in negative lanes, zero otherwise
+            __m128i signBit = _mm_srli_epi16(a.m_value, 15); // 1 in negative lanes, zero otherwise
+
+            UInt16 magnitude;
+            magnitude.m_value = _mm_add_epi16(signBit, _mm_xor_si128(signMask, a.m_value));
+            return magnitude;
+        }
+
+        static Float Abs(const Float &a) // clears the sign bit of every lane
+        {
+            __m128 signBit = _mm_set1_ps(-0.0f); // -0.0f has only the sign bit set
+
+            Float magnitude;
+            for (int half = 0; half < 2; half++)
+                magnitude.m_values[half] = _mm_andnot_ps(signBit, a.m_values[half]);
+            return magnitude;
+        }
+
+        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b) // (a - b)^2, keeping only the low 16 bits of each product
+        {
+            __m128i delta = _mm_sub_epi16(a.m_value, b.m_value);
+
+            UInt16 squared;
+            squared.m_value = _mm_mullo_epi16(delta, delta);
+            return squared;
+        }
+
+        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b) // (a - b)^2 per lane, returned as float
+        {
+            __m128i absDelta = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value)); // max - min, read as unsigned 16-bit
+
+            __m128i productLo16 = _mm_mullo_epi16(absDelta, absDelta);
+            __m128i productHi16 = _mm_mulhi_epu16(absDelta, absDelta);
+            __m128i squareLanes4to7 = _mm_unpackhi_epi16(productLo16, productHi16); // interleave into full 32-bit products
+            __m128i squareLanes0to3 = _mm_unpacklo_epi16(productLo16, productHi16);
+
+            Float squared;
+            squared.m_values[0] = _mm_cvtepi32_ps(squareLanes0to3); // NOTE(review): signed conversion; a product >= 2^31 would come out negative -- confirm range
+            squared.m_values[1] = _mm_cvtepi32_ps(squareLanes4to7);
+
+            return squared;
+        }
+
+        static Float TwosCLHalfToFloat(const SInt16 &v) // Assembles one 32-bit float per lane from the sign/exponent/mantissa bit fields of each 16-bit lane.
+        {
+            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15)); // |v| per lane. NOTE(review): never read below; the fields are masked from v.m_value -- confirm whether absV was intended
+
+            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768)); // bit 15
+            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff)); // bits 0-9
+            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00)); // bits 10-14
+
+            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128()); // all-ones in lanes whose exponent field is zero
+
+            // Shift the exponent field to bits 7-11 and add 14336 (112 << 7); presumably the half-to-float bias difference 127 - 15 -- TODO confirm
+            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
+
+            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336))); // upper 16 bits of the float subtracted below; zero in lanes with a non-zero exponent field
+
+            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3))); // upper 16 bits of the float: sign, exponent, top 7 mantissa bits
+            __m128i lowBits = _mm_slli_epi16(mantissa, 13); // lower 16 bits of the float: remaining 3 mantissa bits at the top
+
+            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits); // interleave into 32-bit patterns for lanes 0-3
+            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits); // and for lanes 4-7
+
+            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh); // correction as 32-bit patterns with the lower 16 bits zero
+            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
+
+            Float result;
+            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow)); // float subtraction; the correction is +0.0f in lanes with a non-zero exponent field
+            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
+
+            return result;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) // (TwosCLHalfToFloat(a) - b)^2 per lane
+        {
+            Float aAsFloat = TwosCLHalfToFloat(a);
+
+            Float delta = aAsFloat - b;
+            return delta * delta;
+        }
+
+        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) // (TwosCLHalfToFloat(a) - TwosCLHalfToFloat(b))^2 per lane
+        {
+            Float aAsFloat = TwosCLHalfToFloat(a);
+            Float bAsFloat = TwosCLHalfToFloat(b);
+
+            Float delta = aAsFloat - bAsFloat;
+            return delta * delta;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) // (TwosCLHalfToFloat(a) * aWeight - b)^2 per lane
+        {
+            Float weightedA = TwosCLHalfToFloat(a) * aWeight;
+
+            Float delta = weightedA - b;
+            return delta * delta;
+        }
+
+        static UInt16 RightShift(const UInt16 &v, int bits) // logical (zero-filling) right shift of each 16-bit lane
+        {
+            UInt16 shifted;
+            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        static UInt31 RightShift(const UInt31 &v, int bits) // logical (zero-filling) right shift of each 32-bit lane
+        {
+            UInt31 shifted;
+            for (int half = 0; half < 2; half++)
+                shifted.m_values[half] = _mm_srli_epi32(v.m_values[half], bits);
+            return shifted;
+        }
+
+        static SInt16 RightShift(const SInt16 &v, int bits) // arithmetic (sign-filling) right shift of each 16-bit lane
+        {
+            SInt16 shifted;
+            shifted.m_value = _mm_srai_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        static UInt15 RightShift(const UInt15 &v, int bits) // logical (zero-filling) right shift of each 16-bit lane
+        {
+            UInt15 shifted;
+            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        static SInt32 RightShift(const SInt32 &v, int bits) // arithmetic (sign-filling) right shift of each 32-bit lane
+        {
+            SInt32 shifted;
+            for (int half = 0; half < 2; half++)
+                shifted.m_values[half] = _mm_srai_epi32(v.m_values[half], bits);
+            return shifted;
+        }
+
+        static SInt16 ToSInt16(const SInt32 &v) // narrows 32-bit lanes to 16 bits with signed saturation
+        {
+            SInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        static SInt16 ToSInt16(const UInt16 &v) // relabels the same bits as SInt16; no lane is modified
+        {
+            SInt16 relabeled;
+            relabeled.m_value = v.m_value;
+            return relabeled;
+        }
+
+        static SInt16 ToSInt16(const UInt15 &v) // relabels the same bits as SInt16; no lane is modified
+        {
+            SInt16 relabeled;
+            relabeled.m_value = v.m_value;
+            return relabeled;
+        }
+
+        static UInt16 ToUInt16(const UInt32 &v) // keeps the low 16 bits of each 32-bit lane
+        {
+            __m128i signExtendedLo = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); // sign-extend the low half so the saturating pack passes it through unchanged
+            __m128i signExtendedHi = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
+
+            UInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(signExtendedLo, signExtendedHi);
+            return narrowed;
+        }
+
+        static UInt16 ToUInt16(const UInt31 &v) // keeps the low 16 bits of each 32-bit lane
+        {
+            __m128i signExtendedLo = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); // sign-extend the low half so the saturating pack passes it through unchanged
+            __m128i signExtendedHi = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
+
+            UInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(signExtendedLo, signExtendedHi);
+            return narrowed;
+        }
+
+        static UInt15 ToUInt15(const UInt31 &v) // narrows 32-bit lanes to 16 bits with signed saturation
+        {
+            UInt15 narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        static UInt15 ToUInt15(const SInt16 &v) // relabels the same bits as UInt15; no lane is modified
+        {
+            UInt15 relabeled;
+            relabeled.m_value = v.m_value;
+            return relabeled;
+        }
+
+        static UInt15 ToUInt15(const UInt16 &v) // relabels the same bits as UInt15; no lane is modified
+        {
+            UInt15 relabeled;
+            relabeled.m_value = v.m_value;
+            return relabeled;
+        }
+
+        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b) // full 32-bit signed products of 16-bit lanes
+        {
+            __m128i productLo16 = _mm_mullo_epi16(a.m_value, b.m_value);
+            __m128i productHi16 = _mm_mulhi_epi16(a.m_value, b.m_value);
+
+            SInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo16, productHi16); // interleave low/high halves into 32-bit lanes
+            product.m_values[1] = _mm_unpackhi_epi16(productLo16, productHi16);
+            return product;
+        }
+
+        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b) // full 32-bit products, both operands multiplied as signed 16-bit
+        {
+            __m128i productLo16 = _mm_mullo_epi16(a.m_value, b.m_value);
+            __m128i productHi16 = _mm_mulhi_epi16(a.m_value, b.m_value);
+
+            SInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo16, productHi16); // interleave low/high halves into 32-bit lanes
+            product.m_values[1] = _mm_unpackhi_epi16(productLo16, productHi16);
+            return product;
+        }
+
+        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b) // argument-order convenience: forwards to XMultiply(SInt16, UInt15)
+        {
+            return XMultiply(b, a);
+        }
+
+        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b) // full 32-bit unsigned products of 16-bit lanes
+        {
+            __m128i productLo16 = _mm_mullo_epi16(a.m_value, b.m_value);
+            __m128i productHi16 = _mm_mulhi_epu16(a.m_value, b.m_value);
+
+            UInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo16, productHi16); // interleave low/high halves into 32-bit lanes
+            product.m_values[1] = _mm_unpackhi_epi16(productLo16, productHi16);
+            return product;
+        }
+
+        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b) // lane-wise product truncated to its low 16 bits
+        {
+            UInt16 product;
+            product.m_value = _mm_mullo_epi16(b.m_value, a.m_value);
+            return product;
+        }
+
+        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b) // lane-wise product truncated to its low 16 bits
+        {
+            UInt16 product;
+            product.m_value = _mm_mullo_epi16(b.m_value, a.m_value);
+            return product;
+        }
+
+        static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b) // lane-wise product truncated to its low 16 bits
+        {
+            SInt16 product;
+            product.m_value = _mm_mullo_epi16(b.m_value, a.m_value);
+            return product;
+        }
+
+        static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b) // lane-wise product truncated to its low 16 bits
+        {
+            SInt16 product;
+            product.m_value = _mm_mullo_epi16(b.m_value, a.m_value);
+            return product;
+        }
+
+        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b) // full 32-bit unsigned products of 16-bit lanes
+        {
+            __m128i productLo16 = _mm_mullo_epi16(a.m_value, b.m_value);
+            __m128i productHi16 = _mm_mulhi_epu16(a.m_value, b.m_value);
+
+            UInt31 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo16, productHi16); // interleave low/high halves into 32-bit lanes
+            product.m_values[1] = _mm_unpackhi_epi16(productLo16, productHi16);
+            return product;
+        }
+
+        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b) // full 32-bit unsigned products of 16-bit lanes
+        {
+            __m128i productLo16 = _mm_mullo_epi16(a.m_value, b.m_value);
+            __m128i productHi16 = _mm_mulhi_epu16(a.m_value, b.m_value);
+
+            UInt31 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo16, productHi16); // interleave low/high halves into 32-bit lanes
+            product.m_values[1] = _mm_unpackhi_epi16(productLo16, productHi16);
+            return product;
+        }
+
+        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b) // argument-order convenience: forwards to XMultiply(UInt16, UInt15)
+        {
+            return XMultiply(b, a);
+        }
+
+        static bool AnySet(const Int16CompFlag &v) // true when the top bit of at least one byte of the mask is set
+        {
+            return 0 != _mm_movemask_epi8(v.m_value);
+        }
+
+        static bool AllSet(const Int16CompFlag &v) // true when the top bit of all 16 bytes of the mask is set
+        {
+            return 0xffff == _mm_movemask_epi8(v.m_value);
+        }
+
+        static bool AnySet(const FloatCompFlag &v) // true when the sign bit of at least one of the eight lanes is set
+        {
+            return (_mm_movemask_ps(v.m_values[0]) | _mm_movemask_ps(v.m_values[1])) != 0;
+        }
+
+        static bool AllSet(const FloatCompFlag &v) // true when the sign bit of all eight lanes is set
+        {
+            return (_mm_movemask_ps(v.m_values[0]) & _mm_movemask_ps(v.m_values[1])) == 0xf;
+        }
+    };
+
+#else
+    // Scalar fallback: one lane per value (ParallelSize == 1); integer flavours are int32_t, comparison flags are bool.
+    struct ParallelMath
+    {
+        struct RoundTowardZeroForScope
+        {
+        };
+
+        struct RoundTowardNearestForScope
+        {
+        };
+
+        struct RoundUpForScope
+        {
+        };
+
+        struct RoundDownForScope
+        {
+        };
+
+        static const int ParallelSize = 1; // a single value is processed per call
+
+        enum Int16Subtype
+        {
+            IntSubtype_Signed,
+            IntSubtype_UnsignedFull,
+            IntSubtype_UnsignedTruncated,
+            IntSubtype_Abstract,
+        };
+
+        typedef int32_t SInt16; // every 16-bit flavour below shares the same int32_t storage
+        typedef int32_t UInt15;
+        typedef int32_t UInt16;
+        typedef int32_t AInt16;
+
+        typedef int32_t SInt32;
+        typedef int32_t UInt31;
+        typedef int32_t UInt32;
+        typedef int32_t AInt32;
+
+        typedef int32_t ScalarUInt16;
+        typedef int32_t ScalarSInt16;
+
+        typedef float Float;
+
+        template<class TTargetType>
+        struct LosslessCast
+        {
+            static const int32_t& Cast(const int32_t &src)
+            {
+                return src; // all integer flavours are int32_t here, so the cast is the identity
+            }
+        };
+
+        typedef bool Int16CompFlag;
+        typedef bool FloatCompFlag;
+
+        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
+        {
+            return a + b;
+        }
+
+        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
+        {
+            return a - b;
+        }
+
+        static float Select(bool flag, float a, float b)
+        {
+            return flag ? a : b;
+        }
+
+        static int32_t Select(bool flag, int32_t a, int32_t b)
+        {
+            return flag ? a : b;
+        }
+
+        static int32_t SelectOrZero(bool flag, int32_t a)
+        {
+            return flag ? a : 0;
+        }
+
+        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static void ConditionalSet(bool& dest, bool flag, bool src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static int32_t ConditionalNegate(bool flag, int32_t v)
+        {
+            return (flag) ? -v : v;
+        }
+
+        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
+        {
+            if (!flag)
+                dest = src;
+        }
+
+        static void ConditionalSet(float& dest, bool flag, float src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static void NotConditionalSet(float& dest, bool flag, float src)
+        {
+            if (!flag)
+                dest = src;
+        }
+
+        static void MakeSafeDenominator(float& v)
+        {
+            if (v == 0.0f)
+                v = 1.0f; // an exact zero is replaced by one
+        }
+
+        static int32_t SignedRightShift(int32_t v, int bits)
+        {
+            return v >> bits;
+        }
+
+        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
+        {
+            v = static_cast<int32_t>(static_cast<uint32_t>(v) << (32 - precision)); // shift as unsigned: left-shifting a negative int is undefined before C++20
+            return SignedRightShift(v, 32 - precision); // shift back down through the signed helper
+        }
+
+        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
+        {
+            return v & ((1 << precision) - 1);
+        }
+
+        static int32_t Min(int32_t a, int32_t b)
+        {
+            if (a < b)
+                return a;
+            return b;
+        }
+
+        static float Min(float a, float b)
+        {
+            if (a < b)
+                return a;
+            return b;
+        }
+
+        static int32_t Max(int32_t a, int32_t b)
+        {
+            if (a > b)
+                return a;
+            return b;
+        }
+
+        static float Max(float a, float b)
+        {
+            if (a > b)
+                return a;
+            return b;
+        }
+
+        static float Abs(float a)
+        {
+            return fabsf(a);
+        }
+
+        static int32_t Abs(int32_t a)
+        {
+            if (a < 0)
+                return -a;
+            return a;
+        }
+
+        static float Clamp(float v, float min, float max)
+        {
+            if (v < min)
+                return min;
+            if (v > max)
+                return max;
+            return v;
+        }
+
+        static float Reciprocal(float v)
+        {
+            return 1.0f / v;
+        }
+
+        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
+        {
+            chOut = inputBlocks[0].m_pixels[pxOffset][channel]; // only the first block is read in the scalar build
+        }
+
+        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
+        {
+            chOut = inputBlocks[0].m_pixels[pxOffset][channel]; // only the first block is read in the scalar build
+        }
+
+        static float MakeFloat(float v)
+        {
+            return v;
+        }
+
+        static float MakeFloatZero()
+        {
+            return 0.0f;
+        }
+
+        static int32_t MakeUInt16(uint16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeSInt16(int16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeAInt16(int16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeUInt15(uint16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeSInt32(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeUInt31(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t Extract(int32_t v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v; // single lane: the offset is ignored
+        }
+
+        static bool Extract(bool v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static float Extract(float v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v; // single lane: the offset is ignored
+        }
+
+        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static float ExtractFloat(float v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static void PutFloat(float &dest, int offset, float v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutBoolInt16(bool &dest, int offset, bool v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static bool Less(int32_t a, int32_t b)
+        {
+            return a < b;
+        }
+
+        static bool Less(float a, float b)
+        {
+            return a < b;
+        }
+
+        static bool LessOrEqual(int32_t a, int32_t b)
+        {
+            return a <= b; // inclusive: equal operands must compare true
+        }
+
+        static bool LessOrEqual(float a, float b)
+        {
+            return a <= b; // inclusive: equal operands must compare true
+        }
+
+        static bool Equal(int32_t a, int32_t b)
+        {
+            return a == b;
+        }
+
+        static bool Equal(float a, float b)
+        {
+            return a == b;
+        }
+
+        static float ToFloat(int32_t v)
+        {
+            return static_cast<float>(v);
+        }
+
+        static int32_t ToUInt31(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToInt32(int32_t v)
+        {
+            return v;
+        }
+
+        static bool FloatFlagToInt16(bool v)
+        {
+            return v;
+        }
+
+        static bool Int32FlagToInt16(bool v)
+        {
+            return v;
+        }
+
+        static bool Int16FlagToFloat(bool v)
+        {
+            return v;
+        }
+
+        static bool MakeBoolInt16(bool b)
+        {
+            return b;
+        }
+
+        static bool MakeBoolFloat(bool b)
+        {
+            return b;
+        }
+
+        static bool AndNot(bool a, bool b)
+        {
+            return a && !b;
+        }
+
+        static bool Not(bool b)
+        {
+            return !b;
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
+        {
+            UNREFERENCED_PARAMETER(rtz);
+            return static_cast<int>(v); // the cast itself truncates toward zero
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
+        {
+            UNREFERENCED_PARAMETER(ru);
+            return static_cast<int>(ceilf(v));
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
+        {
+            UNREFERENCED_PARAMETER(rd);
+            return static_cast<int>(floorf(v));
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
+        {
+            UNREFERENCED_PARAMETER(rtn);
+            return static_cast<int>(floorf(v + 0.5f)); // nearest, with exact halves rounded up
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        static float Sqrt(float f)
+        {
+            return sqrtf(f);
+        }
+
+        static int32_t SqDiffUInt8(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static int32_t SqDiffInt16(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static int32_t SqDiffSInt16(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static float TwosCLHalfToFloat(int32_t v)
+        {
+            int32_t absV = (v < 0) ? -v : v;
+
+            int32_t signBits = (absV & -32768);
+            int32_t mantissa = (absV & 0x03ff);
+            int32_t exponent = (absV & 0x7c00);
+
+            bool isDenormal = (exponent == 0);
+
+            // Convert exponent to high-bits
+            exponent = (exponent >> 3) + 14336;
+
+            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
+
+            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
+
+            float f, correction;
+            memcpy(&f, &fBits, 4); // reinterpret the assembled bit patterns as floats
+            memcpy(&correction, &denormalCorrection, 4);
+
+            return f - correction;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
+        {
+            Float fa = TwosCLHalfToFloat(a);
+
+            Float diff = fa - b;
+            return diff * diff;
+        }
+
+        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
+        {
+            Float fa = TwosCLHalfToFloat(a);
+            Float fb = TwosCLHalfToFloat(b);
+
+            Float diff = fa - fb;
+            return diff * diff;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
+        {
+            Float fa = TwosCLHalfToFloat(a) * aWeight;
+
+            Float diff = fa - b;
+            return diff * diff;
+        }
+
+        static int32_t RightShift(int32_t v, int bits)
+        {
+            return SignedRightShift(v, bits);
+        }
+
+        static int32_t ToSInt16(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToUInt16(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToUInt15(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t XMultiply(int32_t a, int32_t b)
+        {
+            return a * b;
+        }
+
+        static int32_t CompactMultiply(int32_t a, int32_t b)
+        {
+            return a * b;
+        }
+
+        static bool AnySet(bool v)
+        {
+            return v;
+        }
+
+        static bool AllSet(bool v)
+        {
+            return v;
+        }
+    };
+
+#endif
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC.cpp b/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
new file mode 100644
index 0000000000..23f1bd3314
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
@@ -0,0 +1,1054 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_S3TC.h"
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_BCCommon.h"
+#include "ConvectionKernels_EndpointRefiner.h"
+#include "ConvectionKernels_EndpointSelector.h"
+#include "ConvectionKernels_IndexSelector.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+#include "ConvectionKernels_S3TC_SingleColor.h"
+
+void cvtt::Internal::S3TCComputer::Init(MFloat& error)
+{
+    error = ParallelMath::MakeFloat(FLT_MAX); // seed the error with FLT_MAX, the largest finite float
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v)
+{
+    MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); // (v * 253 + 512) >> 10
+    v = (reduced << 2) | ParallelMath::RightShift(reduced, 4); // write back in place as (reduced << 2) | (reduced >> 4)
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v)
+{
+    MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); // (v * 249 + 1024) >> 11
+    v = (reduced << 3) | ParallelMath::RightShift(reduced, 2); // write back in place as (reduced << 3) | (reduced >> 2)
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3])
+{
+    QuantizeTo5Bits(endPoint[0]); // channel 0: 5-bit quantization, in place
+    QuantizeTo6Bits(endPoint[1]); // channel 1: 6-bit quantization, in place
+    QuantizeTo5Bits(endPoint[2]); // channel 2: 5-bit quantization, in place
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
+{
+    return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; // |span| scaled by 0.03
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
+{
+    MFloat magnitude = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
+    MFloat padded = magnitude + d; // |a - b| plus the caller-supplied term d
+    return padded * padded;
+}
+
+void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
+    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
+{ // floatPixels and rtn are not read in this body
+    float weightSq[3]; // squared per-channel weights, applied to the squared differences below
+
+    for (int c = 0; c < 3; c++)
+        weightSq[c] = channelWeights[c] * channelWeights[c];
+
+    MUInt15 sums[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    for (int p = 0; p < 16; p++)
+    {
+        for (int c = 0; c < 3; c++)
+            sums[c] = sums[c] + pixels[p][c];
+    }
+
+    MUInt15 mean[3]; // rounded average over the 16 pixels: (sum + 8) >> 4
+    for (int c = 0; c < 3; c++)
+        mean[c] = ParallelMath::RightShift(sums[c] + ParallelMath::MakeUInt15(8), 4);
+
+    const Tables::S3TCSC::TableEntry* redBlueLUT = NULL; // used for channels 0 and 2
+    const Tables::S3TCSC::TableEntry* greenLUT = NULL;   // used for channel 1
+    if (flags & cvtt::Flags::S3TC_Paranoid)
+    {
+        if (range == 4)
+        {
+            redBlueLUT = Tables::S3TCSC::g_singleColor5_3_p;
+            greenLUT = Tables::S3TCSC::g_singleColor6_3_p;
+        }
+        else
+        {
+            assert(range == 3);
+            redBlueLUT = Tables::S3TCSC::g_singleColor5_2_p;
+            greenLUT = Tables::S3TCSC::g_singleColor6_2_p;
+        }
+    }
+    else
+    {
+        if (range == 4)
+        {
+            redBlueLUT = Tables::S3TCSC::g_singleColor5_3;
+            greenLUT = Tables::S3TCSC::g_singleColor6_3;
+        }
+        else
+        {
+            assert(range == 3);
+            redBlueLUT = Tables::S3TCSC::g_singleColor5_2;
+            greenLUT = Tables::S3TCSC::g_singleColor6_2;
+        }
+    }
+
+    MUInt15 actual[3];         // table's m_actualColor for each channel
+    MUInt15 candidateEP[2][3]; // [0] = table m_min, [1] = table m_max
+    MSInt16 epSpans[3];
+    for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+    {
+        for (int c = 0; c < 3; c++)
+        {
+            uint16_t avg = ParallelMath::Extract(mean[c], lane);
+            const Tables::S3TCSC::TableEntry& entry = ((c == 1) ? greenLUT[avg] : redBlueLUT[avg]);
+            ParallelMath::PutUInt15(candidateEP[0][c], lane, entry.m_min);
+            ParallelMath::PutUInt15(candidateEP[1][c], lane, entry.m_max);
+            ParallelMath::PutUInt15(actual[c], lane, entry.m_actualColor);
+            ParallelMath::PutSInt16(epSpans[c], lane, entry.m_span);
+        }
+    }
+
+    MFloat err = ParallelMath::MakeFloatZero();
+    if (flags & cvtt::Flags::S3TC_Paranoid)
+    {
+        MFloat spanFactors[3];
+        for (int c = 0; c < 3; c++)
+            spanFactors[c] = ParanoidFactorForSpan(epSpans[c]);
+
+        for (int p = 0; p < 16; p++)
+        {
+            for (int c = 0; c < 3; c++)
+                err = err + ParanoidDiff(actual[c], pixels[p][c], spanFactors[c]) * weightSq[c];
+        }
+    }
+    else
+    {
+        for (int p = 0; p < 16; p++)
+        {
+            for (int c = 0; c < 3; c++)
+                err = err + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(actual[c], pixels[p][c])) * weightSq[c];
+        }
+    }
+
+    ParallelMath::FloatCompFlag improved = ParallelMath::Less(err, bestError);
+    ParallelMath::Int16CompFlag improved16 = ParallelMath::FloatFlagToInt16(improved);
+
+    if (ParallelMath::AnySet(improved16))
+    {
+        bestError = ParallelMath::Min(bestError, err);
+        for (int e = 0; e < 2; e++)
+            for (int c = 0; c < 3; c++)
+                ParallelMath::ConditionalSet(bestEndpoints[e][c], improved16, candidateEP[e][c]);
+
+        MUInt15 indexOne = ParallelMath::MakeUInt15(1); // every pixel is assigned index 1
+        for (int p = 0; p < 16; p++)
+            ParallelMath::ConditionalSet(bestIndexes[p], improved16, indexOne);
+
+        ParallelMath::ConditionalSet(bestRange, improved16, ParallelMath::MakeUInt15(range));
+    }
+}
+
+void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
+    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    float weightSq[3]; // squared per-channel weights
+
+    for (int c = 0; c < 3; c++)
+        weightSq[c] = channelWeights[c] * channelWeights[c];
+
+    MUInt15 quantizedEP[2][3]; // working copy of the inputs, quantized to 5/6/5 bits below
+
+    for (int e = 0; e < 2; e++)
+        for (int c = 0; c < 3; c++)
+            quantizedEP[e][c] = unquantizedEndPoints[e][c];
+
+    QuantizeTo565(quantizedEP[0]);
+    QuantizeTo565(quantizedEP[1]);
+
+    IndexSelector<3> indexSel;
+    indexSel.Init<false>(channelWeights, quantizedEP, range);
+
+    MUInt15 chosen[16]; // index picked for each pixel
+
+    MFloat spanFactors[3];
+    for (int c = 0; c < 3; c++)
+        spanFactors[c] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(quantizedEP[0][c]) - ParallelMath::LosslessCast<MSInt16>::Cast(quantizedEP[1][c]));
+
+    MFloat err = ParallelMath::MakeFloatZero();
+    AggregatedError<3> accum;
+    for (int p = 0; p < 16; p++)
+    {
+        MUInt15 idx = indexSel.SelectIndexLDR(floatPixels[p], rtn);
+        chosen[p] = idx;
+
+        if (refiner) // optional: a null refiner skips the contribution
+            refiner->ContributeUnweightedPW(preWeightedPixels[p], idx);
+
+        MUInt15 decoded[3];
+        indexSel.ReconstructLDRPrecise(idx, decoded);
+
+        if (flags & Flags::S3TC_Paranoid)
+        {
+            for (int c = 0; c < 3; c++)
+                err = err + ParanoidDiff(decoded[c], pixels[p][c], spanFactors[c]) * weightSq[c];
+        }
+        else
+            BCCommon::ComputeErrorLDR<3>(flags, decoded, pixels[p], accum);
+    }
+
+    if (!(flags & Flags::S3TC_Paranoid)) // non-paranoid mode takes its total from the aggregated error
+        err = accum.Finalize(flags, weightSq);
+
+    ParallelMath::FloatCompFlag improved = ParallelMath::Less(err, bestError);
+
+    if (ParallelMath::AnySet(improved))
+    {
+        ParallelMath::Int16CompFlag improved16 = ParallelMath::FloatFlagToInt16(improved);
+
+        ParallelMath::ConditionalSet(bestError, improved, err);
+
+        for (int e = 0; e < 2; e++)
+            for (int c = 0; c < 3; c++)
+                ParallelMath::ConditionalSet(bestEndpoints[e][c], improved16, quantizedEP[e][c]);
+
+        for (int p = 0; p < 16; p++)
+            ParallelMath::ConditionalSet(bestIndexes[p], improved16, chosen[p]);
+
+        ParallelMath::ConditionalSet(bestRange, improved16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
+    }
+}
+
+void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
+    const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
+    const ParallelMath::RoundTowardNearestForScope* rtn)
+{
+    UNREFERENCED_PARAMETER(alphaTest);
+    UNREFERENCED_PARAMETER(flags);
+
+    EndpointRefiner<3> epRefiner;
+
+    epRefiner.Init(nCounts, channelWeights);
+
+    bool done = false;
+    int inputIdx = 0; // running position in preWeightedFloatSortedInputs
+    for (int bucket = 0; bucket < nCounts; bucket++)
+    {
+        for (int k = 0; k < counts[bucket]; k++)
+        {
+            ParallelMath::Int16CompFlag inRange = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(k)), numElements); // NOTE(review): tests the per-bucket counter, not inputIdx - confirm intended
+            if (!ParallelMath::AnySet(inRange))
+            {
+                done = true;
+                break;
+            }
+
+            if (ParallelMath::AllSet(inRange))
+                epRefiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[inputIdx++], ParallelMath::MakeUInt15(static_cast<uint16_t>(bucket)));
+            else
+            {
+                MFloat laneWeight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(inRange), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f)); // 1 where in range, 0 elsewhere
+                epRefiner.ContributePW(preWeightedFloatSortedInputs[inputIdx++], ParallelMath::MakeUInt15(static_cast<uint16_t>(bucket)), laneWeight);
+            }
+        }
+
+        if (done)
+            break;
+    }
+
+    MUInt15 refinedEP[2][3];
+    epRefiner.GetRefinedEndpointsLDR(refinedEP, rtn);
+
+    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, refinedEP, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
+}
+
+void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
+{
+    UNREFERENCED_PARAMETER(flags);
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    float unitWeight[1] = { 1.0f };
+
+    MUInt15 alpha[16];
+    MFloat alphaF[16]; // float copy of alpha, fed to the index selector
+
+    for (int p = 0; p < 16; p++)
+    {
+        ParallelMath::ConvertLDRInputs(inputs, p, inputChannel, alpha[p]);
+        alphaF[p] = ParallelMath::ToFloat(alpha[p]);
+    }
+
+    MUInt15 fullRange[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } }; // fixed endpoints 0 and 255
+
+    IndexSelector<1> indexSel;
+    indexSel.Init<false>(unitWeight, fullRange, 16);
+
+    MUInt15 nibbles[16]; // selected index per pixel
+
+    for (int p = 0; p < 16; p++)
+        nibbles[p] = indexSel.SelectIndexLDR(&alphaF[p], &rtn);
+
+    for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+    {
+        for (int p = 0; p < 16; p += 2)
+        {
+            int lo = ParallelMath::Extract(nibbles[p], lane);
+            int hi = ParallelMath::Extract(nibbles[p + 1], lane);
+
+            packedBlocks[p / 2] = static_cast<uint8_t>(lo | (hi << 4)); // two pixels per byte, even pixel in the low nibble
+        }
+
+        packedBlocks += packedBlockStride; // advance to the next lane's output block
+    }
+}
+
+void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
+{
+    if (maxTweakRounds < 1)
+        maxTweakRounds = 1;
+
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    float oneWeight[1] = { 1.0f };
+
+    MUInt15 pixels[16];
+    MFloat floatPixels[16];
+
+    MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
+    MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
+
+    for (int px = 0; px < 16; px++)
+    {
+        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
+
+        if (isSigned)
+            pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
+
+        floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
+    }
+
+    MUInt15 sortedPixels[16];
+    for (int px = 0; px < 16; px++)
+        sortedPixels[px] = pixels[px];
+
+    for (int sortEnd = 15; sortEnd > 0; sortEnd--)
+    {
+        for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
+        {
+            MUInt15 a = sortedPixels[sortOffset];
+            MUInt15 b = sortedPixels[sortOffset + 1];
+
+            sortedPixels[sortOffset] = ParallelMath::Min(a, b);
+            sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
+        }
+    }
+
+    MUInt15 zero = ParallelMath::MakeUInt15(0);
+    MUInt15 one = ParallelMath::MakeUInt15(1);
+
+    MUInt15 bestIsFullRange = zero;
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestEP[2] = { zero, zero };
+    MUInt15 bestIndexes[16] = {
+        zero, zero, zero, zero,
+        zero, zero, zero, zero,
+        zero, zero, zero, zero,
+        zero, zero, zero, zero
+    };
+
+    // Full-precision
+    {
+        MUInt15 minEP = sortedPixels[0];
+        MUInt15 maxEP = sortedPixels[15];
+
+        MFloat base[1] = { ParallelMath::ToFloat(minEP) };
+        MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
+
+        UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
+
+        int numTweakRounds = BCCommon::TweakRoundsForRange(8);
+        if (numTweakRounds > maxTweakRounds)
+            numTweakRounds = maxTweakRounds;
+
+        for (int tweak = 0; tweak < numTweakRounds; tweak++)
+        {
+            MUInt15 ep[2][1];
+
+            ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
+
+            for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
+            {
+                EndpointRefiner<1> refiner;
+                refiner.Init(8, oneWeight);
+
+                if (isSigned)
+                    for (int epi = 0; epi < 2; epi++)
+                        ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
+
+                IndexSelector<1> indexSelector;
+                indexSelector.Init<false>(oneWeight, ep, 8);
+
+                MUInt15 indexes[16];
+
+                AggregatedError<1> aggError;
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
+
+                    MUInt15 reconstructedPixel;
+
+                    indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
+                    BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
+
+                    if (refinePass != numRefineRounds - 1)
+                        refiner.ContributeUnweightedPW(&floatPixels[px], index);
+
+                    indexes[px] = index;
+                }
+                MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
+
+                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                if (ParallelMath::AnySet(errorBetter16))
+                {
+                    bestError = ParallelMath::Min(error, bestError);
+                    ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
+                    for (int px = 0; px < 16; px++)
+                        ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
+
+                    for (int epi = 0; epi < 2; epi++)
+                        ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
+                }
+
+                if (refinePass != numRefineRounds - 1)
+                    refiner.GetRefinedEndpointsLDR(ep, &rtn);
+            }
+        }
+    }
+
+    // Reduced precision with special endpoints
+    {
+        MUInt15 bestHeuristicMin = sortedPixels[0];
+        MUInt15 bestHeuristicMax = sortedPixels[15];
+
+        ParallelMath::Int16CompFlag canTryClipping;
+
+        // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
+        // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
+        // This will usually not find anything, but it's cheap to check.
+
+        {
+            MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
+            MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
+
+            MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
+            canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
+        }
+
+        if (ParallelMath::AnySet(canTryClipping))
+        {
+            MUInt15 lowClearances[16];
+            MUInt15 highClearances[16];
+            MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
+
+            lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
+
+            for (int px = 1; px < 16; px++)
+            {
+                lowClearances[px] = sortedPixels[px - 1];
+                highClearances[px] = highTerminal - sortedPixels[16 - px];
+            }
+
+            for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
+            {
+                uint16_t numSkippedLow = firstIndex;
+
+                MUInt15 lowClearance = lowClearances[firstIndex];
+
+                for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
+                {
+                    uint16_t numSkippedHigh = 15 - lastIndex;
+                    uint16_t numSkipped = numSkippedLow + numSkippedHigh;
+
+                    MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
+
+                    ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
+
+                    if (!ParallelMath::AnySet(areMoreSkipped))
+                        continue;
+
+                    MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
+                    MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
+
+                    MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
+
+                    ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
+                    ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
+                    ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
+                }
+            }
+        }
+
+        MUInt15 bestSimpleMin = one;
+        MUInt15 bestSimpleMax = highTerminalMinusOne;
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
+            ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
+        }
+
+        MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
+        MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
+
+        int minEPRange = 2;
+        if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
+            minEPRange = 1;
+
+        int maxEPRange = 2;
+        if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
+            maxEPRange = 1;
+
+        for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
+        {
+            for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
+            {
+                MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
+                MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
+
+                UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
+
+                int numTweakRounds = BCCommon::TweakRoundsForRange(6);
+                if (numTweakRounds > maxTweakRounds)
+                    numTweakRounds = maxTweakRounds;
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    MUInt15 ep[2][1];
+
+                    ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
+
+                    for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
+                    {
+                        EndpointRefiner<1> refiner;
+                        refiner.Init(6, oneWeight);
+
+                        if (isSigned)
+                            for (int epi = 0; epi < 2; epi++)
+                                ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
+
+                        IndexSelector<1> indexSelector;
+                        indexSelector.Init<false>(oneWeight, ep, 6);
+
+                        MUInt15 indexes[16];
+                        MFloat error = ParallelMath::MakeFloatZero();
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
+
+                            MUInt15 reconstructedPixel;
+
+                            indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
+
+                            MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
+                            MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
+                            MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
+
+                            MFloat bestPixelError = zeroError;
+                            MUInt15 index = ParallelMath::MakeUInt15(6);
+
+                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
+                            bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
+
+                            ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
+
+                            if (ParallelMath::AllSet(selectedIndexBetter))
+                            {
+                                if (refinePass != numRefineRounds - 1)
+                                    refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
+                            }
+                            else
+                            {
+                                MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
+
+                                if (refinePass != numRefineRounds - 1)
+                                    refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
+                            }
+
+                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
+                            bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
+
+                            error = error + bestPixelError;
+
+                            indexes[px] = index;
+                        }
+
+                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                        if (ParallelMath::AnySet(errorBetter16))
+                        {
+                            bestError = ParallelMath::Min(error, bestError);
+                            ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
+
+                            for (int epi = 0; epi < 2; epi++)
+                                ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
+                        }
+
+                        if (refinePass != numRefineRounds - 1)
+                            refiner.GetRefinedEndpointsLDR(ep, &rtn);
+                    }
+                }
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int ep0 = ParallelMath::Extract(bestEP[0], block);
+        int ep1 = ParallelMath::Extract(bestEP[1], block);
+        int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
+
+        if (isSigned)
+        {
+            ep0 -= 127;
+            ep1 -= 127;
+
+            assert(ep0 >= -127 && ep0 <= 127);
+            assert(ep1 >= -127 && ep1 <= 127);
+        }
+
+
+        bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
+
+        if (swapEndpoints)
+            std::swap(ep0, ep1);
+
+        uint16_t dumpBits = 0;
+        int dumpBitsOffset = 0;
+        int dumpByteOffset = 2;
+        packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
+        packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
+
+        int maxValue = (isFullRange != 0) ? 7 : 5;
+
+        for (int px = 0; px < 16; px++)
+        {
+            int index = ParallelMath::Extract(bestIndexes[px], block);
+
+            if (swapEndpoints && index <= maxValue)
+                index = maxValue - index;
+
+            if (index != 0)
+            {
+                if (index == maxValue)
+                    index = 1;
+                else if (index < maxValue)
+                    index++;
+            }
+
+            assert(index >= 0 && index < 8);
+
+            dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
+            dumpBitsOffset += 3;
+
+            if (dumpBitsOffset >= 8)
+            {
+                assert(dumpByteOffset < 8);
+                packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
+                dumpBits >>= 8;
+                dumpBitsOffset -= 8;
+                dumpByteOffset++;
+            }
+        }
+
+        assert(dumpBitsOffset == 0);
+        assert(dumpByteOffset == 8);
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
+{
+    ParallelMath::RoundTowardNearestForScope rtn;
+    // Encodes ParallelMath::ParallelSize blocks of 16 RGBA pixels from `inputs`; each result is 8 bytes (two 5:6:5 endpoints, then sixteen 2-bit indexes) written at packedBlocks, which advances by packedBlockStride per block.
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+    // Both round counts are clamped so that at least one tweak and one refine iteration run.
+    if (maxTweakRounds < 1)
+        maxTweakRounds = 1;
+
+    EndpointSelector<3, 8> endpointSelector;
+
+    MUInt15 pixels[16][4];
+    MFloat floatPixels[16][4];
+
+    MFloat preWeightedPixels[16][4];
+    // Load all four channels of every pixel, then take a float copy; the copy is made before the alpha binarization below, so floatPixels keeps the input alpha.
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
+    }
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
+    }
+    // With alphaTest, alpha in `pixels` is binarized: values below the rounded threshold (alphaThreshold * 255) become 0, all others 255.
+    if (alphaTest)
+    {
+        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
+            pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
+        }
+    }
+
+    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
+
+    // Per-pixel weights handed to endpointSelector.ContributePass: 1.0 for every pixel,
+    // except that with alphaTest any pixel whose binarized alpha is below 255 gets 0.
+    // (A block-wide minimum alpha used to be accumulated at this point, but nothing ever
+    // read it, so that dead computation has been dropped.)
+
+    MFloat pixelWeights[16];
+    for (int px = 0; px < 16; px++)
+    {
+        pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
+        if (alphaTest)
+        {
+            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
+
+            ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
+        }
+    }
+
+    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+    {
+        for (int px = 0; px < 16; px++)
+            endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
+
+        endpointSelector.FinishPass(pass);
+    }
+
+    UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
+    // Running best candidate per lane; the Test* helpers receive these as writable arguments and the packing loop at the end reads them.
+    MUInt15 bestEndpoints[2][3];
+    MUInt15 bestIndexes[16];
+    MUInt15 bestRange = ParallelMath::MakeUInt15(0);
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    for (int ep = 0; ep < 2; ep++)
+        for (int ch = 0; ch < 3; ch++)
+            bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
+
+    if (exhaustive)
+    {
+        MSInt16 sortBins[16];
+
+        {
+            // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
+            // and pack the original indexes into the low bits.
+
+            MUInt15 sortEP[2][3];
+            ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
+
+            IndexSelector<3> sortSelector;
+            sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
+
+            for (int16_t px = 0; px < 16; px++)
+            {
+                MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
+
+                if (alphaTest)
+                {
+                    ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
+
+                    ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
+                }
+
+                sortBin = sortBin + ParallelMath::MakeSInt16(px);
+
+                sortBins[px] = sortBin;
+            }
+        }
+
+        // Sort bins
+        for (int sortEnd = 1; sortEnd < 16; sortEnd++)
+        {
+            for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
+            {
+                MSInt16 a = sortBins[sortLoc];
+                MSInt16 b = sortBins[sortLoc - 1];
+
+                sortBins[sortLoc] = ParallelMath::Max(a, b);
+                sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
+            }
+        }
+        // Transparent pixels were given negative bins, so after the ascending sort they sit at the front; firstElement ends up as the number of such leading entries per lane.
+        MUInt15 firstElement = ParallelMath::MakeUInt15(0);
+        for (uint16_t e = 0; e < 16; e++)
+        {
+            ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
+            ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
+            if (!ParallelMath::AnySet(isInvalid))
+                break;
+        }
+
+        MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
+
+        MUInt15 sortedInputs[16][4];
+        MFloat floatSortedInputs[16][4];
+        MFloat pwFloatSortedInputs[16][4];
+
+        for (int e = 0; e < 16; e++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+                sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
+        }
+        // Gather each lane's non-transparent pixels in descending bin order (sorted entry e lands in slot 15 - e); slots past numElements stay zero.
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
+            {
+                ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
+                int originalIndex = (sortBin & 15);
+
+                for (int ch = 0; ch < 4; ch++)
+                    ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
+            }
+        }
+
+        for (int e = 0; e < 16; e++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+            {
+                MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
+                floatSortedInputs[e][ch] = f;
+                pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
+            }
+        }
+        // Enumerate every split of 16 into four non-negative counts in which no single count equals 16, handing each split to TestCounts with range 4.
+        for (int n0 = 0; n0 <= 15; n0++)
+        {
+            int remainingFor1 = 16 - n0;
+            if (remainingFor1 == 16)
+                remainingFor1 = 15;
+
+            for (int n1 = 0; n1 <= remainingFor1; n1++)
+            {
+                int remainingFor2 = 16 - n1 - n0;
+                if (remainingFor2 == 16)
+                    remainingFor2 = 15;
+
+                for (int n2 = 0; n2 <= remainingFor2; n2++)
+                {
+                    int n3 = 16 - n2 - n1 - n0;
+
+                    if (n3 == 16)
+                        continue;
+
+                    int counts[4] = { n0, n1, n2, n3 };
+
+                    TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+                }
+            }
+        }
+
+        TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+        // With alphaTest, repeat the same search with three counts (range 3) so that the three-index encoding is also considered.
+        if (alphaTest)
+        {
+            for (int n0 = 0; n0 <= 15; n0++)
+            {
+                int remainingFor1 = 16 - n0;
+                if (remainingFor1 == 16)
+                    remainingFor1 = 15;
+
+                for (int n1 = 0; n1 <= remainingFor1; n1++)
+                {
+                    int n2 = 16 - n1 - n0;
+
+                    if (n2 == 16)
+                        continue;
+
+                    int counts[3] = { n0, n1, n2 };
+
+                    TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+                }
+            }
+
+            TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+        }
+    }
+    else
+    {
+        int minRange = alphaTest ? 3 : 4;
+        // Non-exhaustive search: range 4 always, plus range 3 when alphaTest is set; each tweaked endpoint pair is tested and then refined numRefineRounds - 1 times.
+        for (int range = minRange; range <= 4; range++)
+        {
+            int tweakRounds = BCCommon::TweakRoundsForRange(range);
+            if (tweakRounds > maxTweakRounds)
+                tweakRounds = maxTweakRounds;
+
+            for (int tweak = 0; tweak < tweakRounds; tweak++)
+            {
+                MUInt15 endPoints[2][3];
+
+                ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
+
+                for (int refine = 0; refine < numRefineRounds; refine++)
+                {
+                    EndpointRefiner<3> refiner;
+                    refiner.Init(range, channelWeights);
+
+                    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
+
+                    if (refine != numRefineRounds - 1)
+                        refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
+                }
+            }
+        }
+    }
+    // Serialize each lane's winning endpoints and indexes into its 8 output bytes.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
+        assert(range == 3 || range == 4);
+
+        ParallelMath::ScalarUInt16 compressedEP[2];
+        for (int ep = 0; ep < 2; ep++)
+        {
+            ParallelMath::ScalarUInt16 endPoint[3];
+            for (int ch = 0; ch < 3; ch++)
+                endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
+            // Pack the three channels as 5:6:5 by keeping the top 5/6/5 bits of each 8-bit value.
+            int compressed = (endPoint[0] & 0xf8) << 8;
+            compressed |= (endPoint[1] & 0xfc) << 3;
+            compressed |= (endPoint[2] & 0xf8) >> 3;
+
+            compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
+        }
+
+        int indexOrder[4];
+        // indexOrder maps an internal index to the stored 2-bit code. Range 4 is stored with endpoint 0 > endpoint 1 (equal endpoints collapse every index to 0); range 3 with endpoint 0 <= endpoint 1.
+        if (range == 4)
+        {
+            if (compressedEP[0] == compressedEP[1])
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 0;
+                indexOrder[2] = 0;
+                indexOrder[3] = 0;
+            }
+            else if (compressedEP[0] < compressedEP[1])
+            {
+                std::swap(compressedEP[0], compressedEP[1]);
+                indexOrder[0] = 1;
+                indexOrder[1] = 3;
+                indexOrder[2] = 2;
+                indexOrder[3] = 0;
+            }
+            else
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 2;
+                indexOrder[2] = 3;
+                indexOrder[3] = 1;
+            }
+        }
+        else
+        {
+            assert(range == 3);
+
+            if (compressedEP[0] > compressedEP[1])
+            {
+                std::swap(compressedEP[0], compressedEP[1]);
+                indexOrder[0] = 1;
+                indexOrder[1] = 2;
+                indexOrder[2] = 0;
+            }
+            else
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 2;
+                indexOrder[2] = 1;
+            }
+            indexOrder[3] = 3;
+        }
+        // Each 16-bit endpoint is written low byte first, followed by four bytes holding four 2-bit codes each.
+        packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
+        packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
+        packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
+        packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
+
+        for (int i = 0; i < 16; i += 4)
+        {
+            int packedIndexes = 0;
+            for (int subi = 0; subi < 4; subi++)
+            {
+                ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
+                packedIndexes |= (indexOrder[index] << (subi * 2));
+            }
+
+            packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
+        }
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC.h b/thirdparty/cvtt/ConvectionKernels_S3TC.h
new file mode 100644
index 0000000000..aa197229c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC.h
@@ -0,0 +1,51 @@
+#pragma once
+#ifndef __CVTT_S3TC_H__
+#define __CVTT_S3TC_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize>
+        class EndpointRefiner; // Forward declaration: this header only names it as a pointer (EndpointRefiner<3> *) in S3TCComputer::TestEndpoints.
+    }
+
+    struct PixelBlockU8; // Forward declaration: this header only names it as const PixelBlockU8* in the S3TCComputer::Pack* signatures.
+}
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        class S3TCComputer // S3TC block-packing routines; the class declares only type aliases and static member functions.
+        {
+        public:
+            typedef ParallelMath::Float MFloat; // Short aliases for the ParallelMath types used throughout the signatures below.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            // Arguments taken by non-const reference or as non-const arrays (error, v, endPoint, bestError, bestEndpoints, bestIndexes, bestRange, packedBlocks) are writable by the callee.
+            static void Init(MFloat& error);
+            static void QuantizeTo6Bits(MUInt15& v);
+            static void QuantizeTo5Bits(MUInt15& v);
+            static void QuantizeTo565(MUInt15 endPoint[3]);
+            static MFloat ParanoidFactorForSpan(const MSInt16& span);
+            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d);
+            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights, // PackRGB calls this after its count search in exhaustive mode, with range 4 and (under alphaTest) range 3.
+                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights, // PackRGB calls this once per refine iteration in its non-exhaustive path.
+                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest, // PackRGB calls this for every candidate split of 16 pixels into nCounts (3 or 4) counts in exhaustive mode.
+                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
+                const ParallelMath::RoundTowardNearestForScope* rtn);
+            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride);
+            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds); // Writes 8 bytes per block (two endpoint bytes, then sixteen 3-bit indexes) and advances packedBlocks by packedBlockStride after each block.
+            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds); // Writes 8 bytes per block (two 5:6:5 endpoints, then sixteen 2-bit indexes) and advances packedBlocks by packedBlockStride after each block; both round counts are clamped to at least 1.
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h b/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h
new file mode 100644
index 0000000000..c772b163c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h
@@ -0,0 +1,304 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace S3TCSC {
+
+struct TableEntry
+{
+    uint8_t m_min;
+    uint8_t m_max;
+    uint8_t m_actualColor;
+    uint8_t m_span;
+};
+
+TableEntry g_singleColor5_3[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 2, 8 }, { 0, 8, 2, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 10, 8 }, { 0, 33, 11, 33 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 18, 8 }, { 8, 41, 19, 33 }, { 24, 16, 21, 8 }, { 24, 16, 21, 8 }, { 33, 0, 22, 33 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 41, 29, 17 }, { 33, 24, 30, 9 }, { 33, 24, 30, 9 },
+    { 24, 49, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 33, 41, 35, 8 }, { 33, 41, 35, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 },
+    { 49, 24, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 43, 8 }, { 33, 66, 44, 33 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 51, 8 }, { 41, 74, 52, 33 }, { 57, 49, 54, 8 }, { 57, 49, 54, 8 }, { 66, 33, 55, 33 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 74, 62, 17 }, { 66, 57, 63, 9 },
+    { 66, 57, 63, 9 }, { 57, 82, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 66, 74, 68, 8 }, { 66, 74, 68, 8 }, { 74, 66, 71, 8 }, { 74, 66, 71, 8 },
+    { 74, 66, 71, 8 }, { 82, 57, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 76, 8 }, { 66, 99, 77, 33 }, { 82, 74, 79, 8 }, { 82, 74, 79, 8 },
+    { 82, 74, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 84, 8 }, { 74, 107, 85, 33 }, { 90, 82, 87, 8 }, { 90, 82, 87, 8 },
+    { 99, 66, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 107, 95, 17 },
+    { 99, 90, 96, 9 }, { 99, 90, 96, 9 }, { 90, 115, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 99, 107, 101, 8 }, { 99, 107, 101, 8 }, { 107, 99, 104, 8 },
+    { 107, 99, 104, 8 }, { 107, 99, 104, 8 }, { 115, 90, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 109, 8 }, { 99, 132, 110, 33 }, { 115, 107, 112, 8 },
+    { 115, 107, 112, 8 }, { 115, 107, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 117, 8 }, { 107, 140, 118, 33 }, { 123, 115, 120, 8 },
+    { 123, 115, 120, 8 }, { 132, 99, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 },
+    { 123, 140, 128, 17 }, { 132, 123, 129, 9 }, { 132, 123, 129, 9 }, { 123, 148, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 132, 140, 134, 8 }, { 132, 140, 134, 8 },
+    { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 148, 123, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 142, 8 }, { 132, 165, 143, 33 },
+    { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 150, 8 }, { 140, 173, 151, 33 },
+    { 156, 148, 153, 8 }, { 156, 148, 153, 8 }, { 165, 132, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 159, 9 }, { 156, 165, 159, 9 },
+    { 156, 165, 159, 9 }, { 156, 173, 161, 17 }, { 165, 156, 162, 9 }, { 165, 156, 162, 9 }, { 156, 181, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 165, 173, 167, 8 },
+    { 165, 173, 167, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 181, 156, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 181, 175, 8 },
+    { 165, 198, 176, 33 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 189, 183, 8 },
+    { 173, 206, 184, 33 }, { 189, 181, 186, 8 }, { 189, 181, 186, 8 }, { 198, 165, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 198, 192, 9 },
+    { 189, 198, 192, 9 }, { 189, 198, 192, 9 }, { 189, 206, 194, 17 }, { 198, 189, 195, 9 }, { 198, 189, 195, 9 }, { 189, 214, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 198, 206, 200, 8 }, { 198, 206, 200, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 214, 189, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 214, 208, 8 }, { 198, 231, 209, 33 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 222, 216, 8 }, { 206, 239, 217, 33 }, { 222, 214, 219, 8 }, { 222, 214, 219, 8 }, { 231, 198, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 239, 227, 17 }, { 231, 222, 228, 9 }, { 231, 222, 228, 9 }, { 222, 247, 230, 25 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 239, 233, 8 }, { 231, 239, 233, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 247, 222, 238, 25 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 247, 241, 8 }, { 239, 247, 241, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 255, 249, 8 }, { 247, 255, 249, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_3[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 4, 1, 4 }, { 4, 0, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 5, 4 }, { 8, 4, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 12, 9, 4 }, { 12, 8, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 13, 4 }, { 16, 12, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 20, 17, 4 }, { 20, 16, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 21, 4 }, { 24, 20, 22, 4 }, { 0, 69, 23, 69 },
+    { 24, 24, 24, 0 }, { 24, 28, 25, 4 }, { 28, 24, 26, 4 }, { 8, 65, 27, 57 }, { 28, 28, 28, 0 }, { 28, 32, 29, 4 }, { 32, 28, 30, 4 }, { 12, 69, 31, 57 },
+    { 32, 32, 32, 0 }, { 32, 36, 33, 4 }, { 36, 32, 34, 4 }, { 20, 65, 35, 45 }, { 36, 36, 36, 0 }, { 36, 40, 37, 4 }, { 40, 36, 38, 4 }, { 24, 69, 39, 45 },
+    { 40, 40, 40, 0 }, { 40, 44, 41, 4 }, { 44, 40, 42, 4 }, { 32, 65, 43, 33 }, { 44, 44, 44, 0 }, { 44, 48, 45, 4 }, { 48, 44, 46, 4 }, { 36, 69, 47, 33 },
+    { 48, 48, 48, 0 }, { 48, 52, 49, 4 }, { 52, 48, 50, 4 }, { 44, 65, 51, 21 }, { 52, 52, 52, 0 }, { 52, 56, 53, 4 }, { 56, 52, 54, 4 }, { 48, 69, 55, 21 },
+    { 56, 56, 56, 0 }, { 56, 60, 57, 4 }, { 60, 56, 58, 4 }, { 56, 65, 59, 9 }, { 60, 60, 60, 0 }, { 60, 65, 61, 5 }, { 65, 56, 62, 9 }, { 65, 60, 63, 5 },
+    { 60, 73, 64, 13 }, { 65, 65, 65, 0 }, { 65, 69, 66, 4 }, { 69, 65, 67, 4 }, { 73, 60, 68, 13 }, { 69, 69, 69, 0 }, { 69, 73, 70, 4 }, { 73, 69, 71, 4 },
+    { 81, 56, 72, 25 }, { 73, 73, 73, 0 }, { 73, 77, 74, 4 }, { 77, 73, 75, 4 }, { 85, 60, 76, 25 }, { 77, 77, 77, 0 }, { 77, 81, 78, 4 }, { 81, 77, 79, 4 },
+    { 93, 56, 80, 37 }, { 81, 81, 81, 0 }, { 81, 85, 82, 4 }, { 85, 81, 83, 4 }, { 97, 60, 84, 37 }, { 85, 85, 85, 0 }, { 85, 89, 86, 4 }, { 89, 85, 87, 4 },
+    { 105, 56, 88, 49 }, { 89, 89, 89, 0 }, { 89, 93, 90, 4 }, { 93, 89, 91, 4 }, { 109, 60, 92, 49 }, { 93, 93, 93, 0 }, { 93, 97, 94, 4 }, { 97, 93, 95, 4 },
+    { 77, 134, 96, 57 }, { 97, 97, 97, 0 }, { 97, 101, 98, 4 }, { 101, 97, 99, 4 }, { 85, 130, 100, 45 }, { 101, 101, 101, 0 }, { 101, 105, 102, 4 }, { 105, 101, 103, 4 },
+    { 89, 134, 104, 45 }, { 105, 105, 105, 0 }, { 105, 109, 106, 4 }, { 109, 105, 107, 4 }, { 97, 130, 108, 33 }, { 109, 109, 109, 0 }, { 109, 113, 110, 4 }, { 113, 109, 111, 4 },
+    { 101, 134, 112, 33 }, { 113, 113, 113, 0 }, { 113, 117, 114, 4 }, { 117, 113, 115, 4 }, { 109, 130, 116, 21 }, { 117, 117, 117, 0 }, { 117, 121, 118, 4 }, { 121, 117, 119, 4 },
+    { 113, 134, 120, 21 }, { 121, 121, 121, 0 }, { 121, 125, 122, 4 }, { 125, 121, 123, 4 }, { 121, 130, 124, 9 }, { 125, 125, 125, 0 }, { 125, 130, 126, 5 }, { 130, 121, 127, 9 },
+    { 130, 125, 128, 5 }, { 125, 138, 129, 13 }, { 130, 130, 130, 0 }, { 130, 134, 131, 4 }, { 134, 130, 132, 4 }, { 138, 125, 133, 13 }, { 134, 134, 134, 0 }, { 134, 138, 135, 4 },
+    { 138, 134, 136, 4 }, { 146, 121, 137, 25 }, { 138, 138, 138, 0 }, { 138, 142, 139, 4 }, { 142, 138, 140, 4 }, { 150, 125, 141, 25 }, { 142, 142, 142, 0 }, { 142, 146, 143, 4 },
+    { 146, 142, 144, 4 }, { 158, 121, 145, 37 }, { 146, 146, 146, 0 }, { 146, 150, 147, 4 }, { 150, 146, 148, 4 }, { 162, 125, 149, 37 }, { 150, 150, 150, 0 }, { 150, 154, 151, 4 },
+    { 154, 150, 152, 4 }, { 170, 121, 153, 49 }, { 154, 154, 154, 0 }, { 154, 158, 155, 4 }, { 158, 154, 156, 4 }, { 174, 125, 157, 49 }, { 158, 158, 158, 0 }, { 158, 162, 159, 4 },
+    { 162, 158, 160, 4 }, { 142, 199, 161, 57 }, { 162, 162, 162, 0 }, { 162, 166, 163, 4 }, { 166, 162, 164, 4 }, { 150, 195, 165, 45 }, { 166, 166, 166, 0 }, { 166, 170, 167, 4 },
+    { 170, 166, 168, 4 }, { 154, 199, 169, 45 }, { 170, 170, 170, 0 }, { 170, 174, 171, 4 }, { 174, 170, 172, 4 }, { 162, 195, 173, 33 }, { 174, 174, 174, 0 }, { 174, 178, 175, 4 },
+    { 178, 174, 176, 4 }, { 166, 199, 177, 33 }, { 178, 178, 178, 0 }, { 178, 182, 179, 4 }, { 182, 178, 180, 4 }, { 174, 195, 181, 21 }, { 182, 182, 182, 0 }, { 182, 186, 183, 4 },
+    { 186, 182, 184, 4 }, { 178, 199, 185, 21 }, { 186, 186, 186, 0 }, { 186, 190, 187, 4 }, { 190, 186, 188, 4 }, { 186, 195, 189, 9 }, { 190, 190, 190, 0 }, { 190, 195, 191, 5 },
+    { 195, 186, 192, 9 }, { 195, 190, 193, 5 }, { 190, 203, 194, 13 }, { 195, 195, 195, 0 }, { 195, 199, 196, 4 }, { 199, 195, 197, 4 }, { 203, 190, 198, 13 }, { 199, 199, 199, 0 },
+    { 199, 203, 200, 4 }, { 203, 199, 201, 4 }, { 211, 186, 202, 25 }, { 203, 203, 203, 0 }, { 203, 207, 204, 4 }, { 207, 203, 205, 4 }, { 215, 190, 206, 25 }, { 207, 207, 207, 0 },
+    { 207, 211, 208, 4 }, { 211, 207, 209, 4 }, { 223, 186, 210, 37 }, { 211, 211, 211, 0 }, { 211, 215, 212, 4 }, { 215, 211, 213, 4 }, { 227, 190, 214, 37 }, { 215, 215, 215, 0 },
+    { 215, 219, 216, 4 }, { 219, 215, 217, 4 }, { 235, 186, 218, 49 }, { 219, 219, 219, 0 }, { 219, 223, 220, 4 }, { 223, 219, 221, 4 }, { 239, 190, 222, 49 }, { 223, 223, 223, 0 },
+    { 223, 227, 224, 4 }, { 227, 223, 225, 4 }, { 247, 186, 226, 61 }, { 227, 227, 227, 0 }, { 227, 231, 228, 4 }, { 231, 227, 229, 4 }, { 251, 190, 230, 61 }, { 231, 231, 231, 0 },
+    { 231, 235, 232, 4 }, { 235, 231, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 236, 4 }, { 239, 235, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 243, 240, 4 }, { 243, 239, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 244, 4 }, { 247, 243, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 251, 248, 4 }, { 251, 247, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 252, 4 }, { 255, 251, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_2[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
+    { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
+    { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
+    { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
+    { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
+    { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
+    { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
+    { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
+    { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
+    { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
+    { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
+    { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
+    { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
+    { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
+    { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
+    { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
+    { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
+    { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
+    { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
+    { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_2[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
+    { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
+    { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
+    { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
+    { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
+    { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
+    { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
+    { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
+    { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
+    { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
+    { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
+    { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
+    { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
+    { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
+    { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
+    { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
+    { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
+    { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
+    { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
+    { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
+    { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
+    { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_3_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 2, 8 }, { 0, 8, 2, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 10, 8 }, { 0, 33, 11, 33 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 18, 8 }, { 8, 41, 19, 33 }, { 24, 16, 21, 8 }, { 24, 16, 21, 8 }, { 33, 0, 22, 33 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 41, 29, 17 }, { 33, 24, 30, 9 }, { 33, 24, 30, 9 },
+    { 24, 49, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 33, 41, 35, 8 }, { 33, 41, 35, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 },
+    { 49, 24, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 43, 8 }, { 33, 66, 44, 33 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 51, 8 }, { 41, 74, 52, 33 }, { 57, 49, 54, 8 }, { 57, 49, 54, 8 }, { 66, 33, 55, 33 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 74, 62, 17 }, { 66, 57, 63, 9 },
+    { 66, 57, 63, 9 }, { 57, 82, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 66, 74, 68, 8 }, { 66, 74, 68, 8 }, { 74, 66, 71, 8 }, { 74, 66, 71, 8 },
+    { 74, 66, 71, 8 }, { 82, 57, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 76, 8 }, { 66, 99, 77, 33 }, { 82, 74, 79, 8 }, { 82, 74, 79, 8 },
+    { 82, 74, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 84, 8 }, { 74, 107, 85, 33 }, { 90, 82, 87, 8 }, { 90, 82, 87, 8 },
+    { 99, 66, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 107, 95, 17 },
+    { 99, 90, 96, 9 }, { 99, 90, 96, 9 }, { 90, 115, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 99, 107, 101, 8 }, { 99, 107, 101, 8 }, { 107, 99, 104, 8 },
+    { 107, 99, 104, 8 }, { 107, 99, 104, 8 }, { 115, 90, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 109, 8 }, { 99, 132, 110, 33 }, { 115, 107, 112, 8 },
+    { 115, 107, 112, 8 }, { 115, 107, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 117, 8 }, { 107, 140, 118, 33 }, { 123, 115, 120, 8 },
+    { 123, 115, 120, 8 }, { 132, 99, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 },
+    { 123, 140, 128, 17 }, { 132, 123, 129, 9 }, { 132, 123, 129, 9 }, { 123, 148, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 132, 140, 134, 8 }, { 132, 140, 134, 8 },
+    { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 148, 123, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 142, 8 }, { 132, 165, 143, 33 },
+    { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 150, 8 }, { 140, 173, 151, 33 },
+    { 156, 148, 153, 8 }, { 156, 148, 153, 8 }, { 165, 132, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 159, 9 }, { 156, 165, 159, 9 },
+    { 156, 165, 159, 9 }, { 156, 173, 161, 17 }, { 165, 156, 162, 9 }, { 165, 156, 162, 9 }, { 156, 181, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 165, 173, 167, 8 },
+    { 165, 173, 167, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 181, 156, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 181, 175, 8 },
+    { 165, 198, 176, 33 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 189, 183, 8 },
+    { 173, 206, 184, 33 }, { 189, 181, 186, 8 }, { 189, 181, 186, 8 }, { 198, 165, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 198, 192, 9 },
+    { 189, 198, 192, 9 }, { 189, 198, 192, 9 }, { 189, 206, 194, 17 }, { 198, 189, 195, 9 }, { 198, 189, 195, 9 }, { 189, 214, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 198, 206, 200, 8 }, { 198, 206, 200, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 214, 189, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 214, 208, 8 }, { 198, 231, 209, 33 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 222, 216, 8 }, { 206, 239, 217, 33 }, { 222, 214, 219, 8 }, { 222, 214, 219, 8 }, { 231, 198, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 239, 227, 17 }, { 231, 222, 228, 9 }, { 231, 222, 228, 9 }, { 222, 247, 230, 25 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 239, 233, 8 }, { 231, 239, 233, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 247, 222, 238, 25 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 247, 241, 8 }, { 239, 247, 241, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 255, 249, 8 }, { 247, 255, 249, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_3_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 4, 1, 4 }, { 4, 0, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 5, 4 }, { 8, 4, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 12, 9, 4 }, { 12, 8, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 13, 4 }, { 16, 12, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 20, 17, 4 }, { 20, 16, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 21, 4 }, { 24, 20, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 28, 25, 4 }, { 28, 24, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 29, 4 }, { 32, 28, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 36, 33, 4 }, { 36, 32, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 37, 4 }, { 40, 36, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 44, 41, 4 }, { 44, 40, 42, 4 }, { 32, 65, 43, 33 }, { 44, 44, 44, 0 }, { 44, 48, 45, 4 }, { 48, 44, 46, 4 }, { 36, 69, 47, 33 },
+    { 48, 48, 48, 0 }, { 48, 52, 49, 4 }, { 52, 48, 50, 4 }, { 44, 65, 51, 21 }, { 52, 52, 52, 0 }, { 52, 56, 53, 4 }, { 56, 52, 54, 4 }, { 48, 69, 55, 21 },
+    { 56, 56, 56, 0 }, { 56, 60, 57, 4 }, { 60, 56, 58, 4 }, { 56, 65, 59, 9 }, { 60, 60, 60, 0 }, { 60, 65, 61, 5 }, { 65, 56, 62, 9 }, { 65, 60, 63, 5 },
+    { 60, 73, 64, 13 }, { 65, 65, 65, 0 }, { 65, 69, 66, 4 }, { 69, 65, 67, 4 }, { 73, 60, 68, 13 }, { 69, 69, 69, 0 }, { 69, 73, 70, 4 }, { 73, 69, 71, 4 },
+    { 81, 56, 72, 25 }, { 73, 73, 73, 0 }, { 73, 77, 74, 4 }, { 77, 73, 75, 4 }, { 85, 60, 76, 25 }, { 77, 77, 77, 0 }, { 77, 81, 78, 4 }, { 81, 77, 79, 4 },
+    { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 82, 4 }, { 85, 81, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 86, 4 }, { 89, 85, 87, 4 },
+    { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 90, 4 }, { 93, 89, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 94, 4 }, { 97, 93, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 98, 4 }, { 101, 97, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 102, 4 }, { 105, 101, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 106, 4 }, { 109, 105, 107, 4 }, { 97, 130, 108, 33 }, { 109, 109, 109, 0 }, { 109, 113, 110, 4 }, { 113, 109, 111, 4 },
+    { 101, 134, 112, 33 }, { 113, 113, 113, 0 }, { 113, 117, 114, 4 }, { 117, 113, 115, 4 }, { 109, 130, 116, 21 }, { 117, 117, 117, 0 }, { 117, 121, 118, 4 }, { 121, 117, 119, 4 },
+    { 113, 134, 120, 21 }, { 121, 121, 121, 0 }, { 121, 125, 122, 4 }, { 125, 121, 123, 4 }, { 121, 130, 124, 9 }, { 125, 125, 125, 0 }, { 125, 130, 126, 5 }, { 130, 121, 127, 9 },
+    { 130, 125, 128, 5 }, { 125, 138, 129, 13 }, { 130, 130, 130, 0 }, { 130, 134, 131, 4 }, { 134, 130, 132, 4 }, { 138, 125, 133, 13 }, { 134, 134, 134, 0 }, { 134, 138, 135, 4 },
+    { 138, 134, 136, 4 }, { 146, 121, 137, 25 }, { 138, 138, 138, 0 }, { 138, 142, 139, 4 }, { 142, 138, 140, 4 }, { 150, 125, 141, 25 }, { 142, 142, 142, 0 }, { 142, 146, 143, 4 },
+    { 146, 142, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 147, 4 }, { 150, 146, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 154, 151, 4 },
+    { 154, 150, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 155, 4 }, { 158, 154, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 162, 159, 4 },
+    { 162, 158, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 163, 4 }, { 166, 162, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 170, 167, 4 },
+    { 170, 166, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 171, 4 }, { 174, 170, 172, 4 }, { 162, 195, 173, 33 }, { 174, 174, 174, 0 }, { 174, 178, 175, 4 },
+    { 178, 174, 176, 4 }, { 166, 199, 177, 33 }, { 178, 178, 178, 0 }, { 178, 182, 179, 4 }, { 182, 178, 180, 4 }, { 174, 195, 181, 21 }, { 182, 182, 182, 0 }, { 182, 186, 183, 4 },
+    { 186, 182, 184, 4 }, { 178, 199, 185, 21 }, { 186, 186, 186, 0 }, { 186, 190, 187, 4 }, { 190, 186, 188, 4 }, { 186, 195, 189, 9 }, { 190, 190, 190, 0 }, { 190, 195, 191, 5 },
+    { 195, 186, 192, 9 }, { 195, 190, 193, 5 }, { 190, 203, 194, 13 }, { 195, 195, 195, 0 }, { 195, 199, 196, 4 }, { 199, 195, 197, 4 }, { 203, 190, 198, 13 }, { 199, 199, 199, 0 },
+    { 199, 203, 200, 4 }, { 203, 199, 201, 4 }, { 211, 186, 202, 25 }, { 203, 203, 203, 0 }, { 203, 207, 204, 4 }, { 207, 203, 205, 4 }, { 215, 190, 206, 25 }, { 207, 207, 207, 0 },
+    { 207, 211, 208, 4 }, { 211, 207, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 212, 4 }, { 215, 211, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
+    { 215, 219, 216, 4 }, { 219, 215, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 220, 4 }, { 223, 219, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
+    { 223, 227, 224, 4 }, { 227, 223, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 228, 4 }, { 231, 227, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 235, 232, 4 }, { 235, 231, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 236, 4 }, { 239, 235, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 243, 240, 4 }, { 243, 239, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 244, 4 }, { 247, 243, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 251, 248, 4 }, { 251, 247, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 252, 4 }, { 255, 251, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_2_p[256] = // 256-entry lookup data. NOTE(review): in the entries below the 3rd value appears to be (1st + 2nd) / 2 rounded down and the 4th |2nd - 1st| -- presumably {min, max, interpolated, span}; confirm against the TableEntry declaration
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
+    { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
+    { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
+    { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
+    { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
+    { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
+    { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
+    { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
+    { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
+    { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
+    { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
+    { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
+    { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
+    { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
+    { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
+    { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
+    { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
+    { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
+    { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
+    { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_2_p[256] = // 256-entry lookup data. NOTE(review): in the entries below the 3rd value appears to be (1st + 2nd) / 2 rounded down and the 4th |2nd - 1st| -- presumably {min, max, interpolated, span}; confirm against the TableEntry declaration
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
+    { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
+    { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
+    { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
+    { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
+    { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
+    { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
+    { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
+    { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
+    { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
+    { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
+    { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
+    { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
+    { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
+    { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
+    { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
+    { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
+    { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
+    { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
+    { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
+    { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
+    { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp b/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp
new file mode 100644
index 0000000000..ad59988655
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp
@@ -0,0 +1,48 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if defined(CVTT_SINGLE_FILE) // single-file build: this translation unit textually includes the implementation .cpp files listed below
+#define CVTT_SINGLE_FILE_IMPL // satisfies the "!defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)" guard in ConvectionKernels_Util.cpp; presumably the other included .cpp files use the same guard -- confirm
+
+#include "ConvectionKernels_API.cpp"
+#include "ConvectionKernels_BC67.cpp"
+#include "ConvectionKernels_BC6H_IO.cpp"
+#include "ConvectionKernels_BC7_PrioData.cpp"
+#include "ConvectionKernels_BCCommon.cpp"
+#include "ConvectionKernels_ETC.cpp"
+#include "ConvectionKernels_IndexSelector.cpp"
+#include "ConvectionKernels_S3TC.cpp"
+#include "ConvectionKernels_Util.cpp"
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h b/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h
new file mode 100644
index 0000000000..371cbe54bf
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "ConvectionKernels_Util.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        // Endpoint pair held as a per-channel base plus a per-channel offset. Each Finish* method evaluates
+        // base + offset * factor for the two factors from Util::ComputeTweakFactors, then clamps and rounds.
+        template<int TVectorSize>
+        class UnfinishedEndpoints
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+
+            // Does not assign m_base or m_offset.
+            UnfinishedEndpoints() {}
+
+            // Copies TVectorSize elements from each of base and offset.
+            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
+            {
+                for (int i = 0; i < TVectorSize; i++)
+                {
+                    m_base[i] = base[i];
+                    m_offset[i] = offset[i];
+                }
+            }
+
+            // Element-wise copy of both arrays from other.
+            UnfinishedEndpoints(const UnfinishedEndpoints& other)
+            {
+                for (int i = 0; i < TVectorSize; i++)
+                {
+                    m_base[i] = other.m_base[i];
+                    m_offset[i] = other.m_offset[i];
+                }
+            }
+
+            // Per channel: clamps base + offset * factor to [0, 31743], rounds via RoundAndConvertToU15, and stores the values cast to SInt16 in outEP0 (factor 0) and outEP1 (factor 1).
+            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                float factors[2];
+                Util::ComputeTweakFactors(tweak, range, factors);
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat lowValue = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * factors[0], 0.0f, 31743.0f);
+                    MUInt15 lowEP = ParallelMath::RoundAndConvertToU15(lowValue, roundingMode);
+                    MFloat highValue = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * factors[1], 0.0f, 31743.0f);
+                    MUInt15 highEP = ParallelMath::RoundAndConvertToU15(highValue, roundingMode);
+                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(lowEP);
+                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(highEP);
+                }
+            }
+
+            // Signed variant: clamps to [-31743, 31743] and rounds via RoundAndConvertToS16; results go to outEP0 (factor 0) and outEP1 (factor 1).
+            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
+            {
+                float factors[2];
+                Util::ComputeTweakFactors(tweak, range, factors);
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat lowValue = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * factors[0], -31743.0f, 31743.0f);
+                    MSInt16 lowEP = ParallelMath::RoundAndConvertToS16(lowValue, roundingMode);
+                    MFloat highValue = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * factors[1], -31743.0f, 31743.0f);
+                    MSInt16 highEP = ParallelMath::RoundAndConvertToS16(highValue, roundingMode);
+                    outEP0[ch] = lowEP;
+                    outEP1[ch] = highEP;
+                }
+            }
+
+            // LDR variant: creates its own RoundTowardNearestForScope, clamps to [0, 255] and rounds via RoundAndConvertToU15 into outEP0 (factor 0) and outEP1 (factor 1).
+            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
+            {
+                ParallelMath::RoundTowardNearestForScope roundingMode;
+                float factors[2];
+                Util::ComputeTweakFactors(tweak, range, factors);
+                MUInt15 *outputs[2] = { outEP0, outEP1 };
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat clamped[2];
+                    for (int epi = 0; epi < 2; epi++)
+                        clamped[epi] = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * factors[epi], 0.0f, 255.0f);
+                    for (int epi = 0; epi < 2; epi++)
+                        outputs[epi][ch] = ParallelMath::RoundAndConvertToU15(clamped[epi], &roundingMode);
+                }
+            }
+
+            // Returns a TNewVectorSize-channel object: channels below TVectorSize are copied from this one; any further channels get base = filler and offset = MakeFloatZero().
+            template<int TNewVectorSize>
+            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
+            {
+                MFloat expandedBase[TNewVectorSize];
+                MFloat expandedOffset[TNewVectorSize];
+                MFloat fillValue = ParallelMath::MakeFloat(filler);
+                for (int ch = 0; ch < TNewVectorSize; ch++)
+                {
+                    if (ch < TVectorSize)
+                    {
+                        expandedBase[ch] = m_base[ch];
+                        expandedOffset[ch] = m_offset[ch];
+                    }
+                    else
+                    {
+                        expandedBase[ch] = fillValue;
+                        expandedOffset[ch] = ParallelMath::MakeFloatZero();
+                    }
+                }
+                return UnfinishedEndpoints<TNewVectorSize>(expandedBase, expandedOffset);
+            }
+
+        private:
+            MFloat m_base[TVectorSize];
+            MFloat m_offset[TVectorSize];
+        };
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_Util.cpp b/thirdparty/cvtt/ConvectionKernels_Util.cpp
new file mode 100644
index 0000000000..d9c25c7845
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Util.cpp
@@ -0,0 +1,88 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+#include <algorithm>
+
+namespace cvtt
+{
+    namespace Util
+    {
+        // Signed input blocks are converted into unsigned space, with the maximum value being 254.
+        // Handles ParallelMath::ParallelSize blocks of 16 pixels x 4 channels; every channel value is
+        // raised to at least -127 and then offset by +127 before being stored as uint8_t.
+        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
+        {
+            for (size_t blockIndex = 0; blockIndex < ParallelMath::ParallelSize; blockIndex++)
+            {
+                for (size_t pixel = 0; pixel < 16; pixel++)
+                    for (size_t channel = 0; channel < 4; channel++)
+                    {
+                        const int clamped = std::max<int>(inputSigned[blockIndex].m_pixels[pixel][channel], -127);
+                        inputNormalized[blockIndex].m_pixels[pixel][channel] = static_cast<uint8_t>(clamped + 127);
+                    }
+            }
+        }
+
+        // Fills channelWeights[0..3] from options.
+        // When options.flags has the Flags::Uniform bit set, every weight is 1.0f; otherwise the weights are
+        // options.redWeight, options.greenWeight, options.blueWeight and options.alphaWeight, in that order.
+        void FillWeights(const Options &options, float channelWeights[4])
+        {
+            const bool uniform = (options.flags & Flags::Uniform) ? true : false;
+
+            channelWeights[0] = uniform ? 1.0f : options.redWeight;
+            channelWeights[1] = uniform ? 1.0f : options.greenWeight;
+            channelWeights[2] = uniform ? 1.0f : options.blueWeight;
+            channelWeights[3] = uniform ? 1.0f : options.alphaWeight;
+        }
+
+        // Writes two factors: outFactors[0] = -(low outside units) / (inside units), outFactors[1] = (high outside units) / (inside units) + 1.
+        // Bit 1 of tweak is the low outside unit count, bit 0 the high one; inside units = (range - 1) minus both. No check is made for a zero divisor.
+        void ComputeTweakFactors(int tweak, int range, float *outFactors)
+        {
+            const int lowOutside = (tweak & 2) >> 1;
+            const int highOutside = tweak & 1;
+            const float inside = static_cast<float>((range - 1) - lowOutside - highOutside);
+            outFactors[0] = -static_cast<float>(lowOutside) / inside;
+            outFactors[1] = static_cast<float>(highOutside) / inside + 1.0f;
+        }
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_Util.h b/thirdparty/cvtt/ConvectionKernels_Util.h
new file mode 100644
index 0000000000..c07b9bf2aa
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Util.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    struct PixelBlockU8; // forward declaration only; not defined in this header
+    struct PixelBlockS8; // forward declaration only; not defined in this header
+    struct Options;      // forward declaration only; not defined in this header
+}
+
+namespace cvtt
+{
+    namespace Util
+    {
+        // Signed input blocks are converted into unsigned space, with the maximum value being 254
+        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize]);
+        void FillWeights(const Options &options, float channelWeights[4]); // writes four weights: all 1.0f when Flags::Uniform is set in options.flags, otherwise options' red, green, blue and alpha weights
+        void ComputeTweakFactors(int tweak, int range, float *outFactors); // writes outFactors[0] and outFactors[1]; bits 1 and 0 of tweak pick the low/high outside unit counts, range - 1 is the total unit count
+    }
+}
diff --git a/thirdparty/cvtt/etc_notes.txt b/thirdparty/cvtt/etc_notes.txt
new file mode 100644
index 0000000000..bb041a8435
--- /dev/null
+++ b/thirdparty/cvtt/etc_notes.txt
@@ -0,0 +1,27 @@
+The ETC1 compressor uses modified cluster fit:
+
+Assume that there exists an ideal base color and set of selectors for a given table.
+For a given table and set of selectors, the ideal base color can be determined by subtracting the offsets from each pixel and averaging them.
+Doing that is equivalent to subtracting the average offset from the average color.
+Because positive and negative selectors of the same magnitude cancel out, the search space of possible average offsets is reduced: 57 unique offsets for the first table and 81 for the others.
+Most of the offsets result in the same color as another average offset due to quantization of the base color, so those can be de-duplicated.
+So:
+- Start with a high-precision average color.
+- Apply precomputed luma offsets to it.
+- Quantize and de-duplicate the base colors.
+- Find the ideal selectors for each base color.
+
+Differential mode is solved by just finding the best legal combination from those attempts.
+
+There are several scenarios where this is not ideal:
+- Clamping behavior can sometimes be leveraged for a more accurate block.
+- Differentials can sometimes be moved slightly closer to become legal.
+- This only works when MSE is the error metric (i.e. not normal maps)
+- This only works when pixel weights are of equal importance (i.e. not using weight by alpha or edge deblocking)
+
+T and H modes generate cluster assignments by computing a chrominance line and splitting the block's pixels in half at the chrominance midpoint; those assignments are then used to determine the averages.
+
+Planar mode is just solved algebraically.
+
+If you want to emulate etc2comp's default settings, add the flag ETC_UseFakeBT709 to use its modified Rec. 709 error coefficients.
+Doing that will significantly slow down encoding because it requires much more complicated quantization math.
+\ No newline at end of file